2009-03-12, 11:23
Hello,
I noticed that I was having problems getting TV show info.
I always used to use tv.com. It seems tvdb is better, but it's having some speed issues at the moment.
So anyway, I quickly fixed the tv.com scraper so I could use it for now.
Here it is:
I tested this on my TV shows and it worked. There is one bug I didn't address: when it grabs the writer credit, it grabs only the first writer listed, not all of them. This could be fixed, but I didn't have the motivation; I also wasn't sure about the format of the TV show XML - it seems well documented for movies, but not for TV :confused2:. I noticed in another scraper that someone grabs all the names and then returns them as one string separated by a | character. Is this the expected format?
Anyway, if anyone finds any mistakes, let me know and I might take a look.
Thanks.
I noticed that I was having problems getting TV show info.
I always used to use tv.com. It seems tvdb is better, but it's having some speed issues at the moment.
So anyway, I quickly fixed the tv.com scraper so I could use it for now.
Here it is:
I tested this on my TV shows and it worked. There is one bug I didn't address: when it grabs the writer credit, it grabs only the first writer listed, not all of them. This could be fixed, but I didn't have the motivation; I also wasn't sure about the format of the TV show XML - it seems well documented for movies, but not for TV :confused2:. I noticed in another scraper that someone grabs all the names and then returns them as one string separated by a | character. Is this the expected format?
Anyway, if anyone finds any mistakes, let me know and I might take a look.
Thanks.
Code:
<?xml version="1.0" encoding="UTF-8"?>
<scraper name="TV.com" content="tvshows" thumb="tvcom.png">
<!-- CreateSearchUrl: wraps a tv.com search URL in a <url> element and stores it in
     buffer 3. The text captured from buffer 1 (presumably the search term; confirm
     against the scraper engine) is inserted as the "qs" query parameter. -->
<CreateSearchUrl dest="3">
  <!-- The output value is itself an XML fragment, so its markup and ampersands are
       entity-escaped here; once this file is parsed the value reads
       <url>...&amp;stype=...</url>, i.e. a well-formed fragment. -->
  <RegExp input="$$1" output="&lt;url&gt;http://www.tv.com/search.php?type=Search&amp;amp;stype=ajax_search&amp;amp;qs=\1&amp;amp;search_type=program&amp;amp;pg_results=0&amp;amp;sort=&lt;/url&gt;" dest="3">
    <!-- NOTE(review): empty pattern; presumably the engine treats it as
         "capture the whole input" so that \1 is defined. Confirm. -->
    <expression></expression>
  </RegExp>
</CreateSearchUrl>
<!-- GetSearchResults: scans the search page in buffer 1 for links to show summary
     pages, builds one <entity> per hit in buffer 4, then wraps the collected
     entities in <results> and stores that in buffer 3. -->
<GetSearchResults dest="3">
  <RegExp input="$$4" output="&lt;results&gt;\1&lt;/results&gt;" dest="3">
    <!-- Inner pass: \1 is the numeric show id, \2 the link text (show title).
         Each entity carries three page URLs (summary, cast, full episode
         listing) plus the id. Markup in the output and in the pattern is
         entity-escaped so this file stays well-formed XML. -->
    <RegExp input="$$1" output="&lt;entity&gt;&lt;title&gt;\2&lt;/title&gt;&lt;url&gt;http://www.tv.com/show/\1/summary.html&lt;/url&gt;&lt;url&gt;http://www.tv.com/show/\1/cast.html&lt;/url&gt;&lt;url&gt;http://www.tv.com/show/\1/episode_listings.html?season=All&lt;/url&gt;&lt;id&gt;\1&lt;/id&gt;&lt;/entity&gt;" dest="4">
      <expression repeat="yes" noclean="1">&lt;a href="http://www\.tv\.com/[^/]*/show/([0-9]+)/summary\.html[^"]*"[^&gt;]*&gt;([^&lt;]+)&lt;/a&gt;</expression>
    </RegExp>
    <!-- NOTE(review): empty pattern; presumably passes all of buffer 4 through as \1. Confirm. -->
    <expression noclean="1"></expression>
  </RegExp>
</GetSearchResults>
<!-- GetDetails: assembles the show-level <details> document. Individual fields are
     appended to buffer 5 (dest="5+"), and the outer RegExp finally wraps buffer 5
     in <details> and stores it in buffer 7.
     Buffers read: $$1 for title, genre, plot, rating, votes, thumb, status and
     premiere date; $$2 for the cast; $$3 for the per-season listing links; $$4 is
     substituted into the episode guide URL as the show id. These appear to
     correspond to the three <url> elements and the <id> emitted for each search
     result (verify against the scraper engine).
     All markup inside output attributes and regex patterns is entity-escaped so
     that this file is well-formed XML; the parsed values are unchanged. -->
<GetDetails dest="7">
  <RegExp input="$$5" output="&lt;details&gt;\1&lt;/details&gt;" dest="7">
    <!-- Title: text of the page <title> up to " on TV.com". Written with dest="5"
         (no "+"), so it starts buffer 5 afresh. -->
    <RegExp input="$$1" output="&lt;title&gt;\1&lt;/title&gt;" dest="5">
      <expression noclean="1">&lt;title&gt;([^&lt;]*) on TV\.com</expression>
    </RegExp>
    <!-- Genres: one <genre> per link whose attributes contain ";genre;". -->
    <RegExp input="$$1" output="&lt;genre&gt;\1&lt;/genre&gt;" dest="5+">
      <expression repeat="yes" noclean="1">;genre;[^&gt;]*&gt;([^&lt;]*)&lt;/a&gt;</expression>
    </RegExp>
    <!-- Disabled earlier plot pattern, kept for reference:
    <RegExp input="$$1" output="&lt;plot&gt;\1&lt;/plot&gt;" dest="5+">
      <expression>id="summary_fold" class="mt-10"&gt;\W*(.*?) *?&lt;/div&gt;</expression>
    </RegExp> -->
    <!-- Plot: two-step extraction. Step 1 copies the contents of the "long"
         summary span into buffer 6; step 2 copies buffer 6 into buffer 8; the
         outer RegExp wraps buffer 8 in <plot>. -->
    <RegExp input="$$8" output="&lt;plot&gt;\1&lt;/plot&gt;" dest="5+">
      <RegExp input="$$1" output="\1" dest="6">
        <expression>&lt;span class="long"&gt;(.*)&lt;/span&gt;[^&lt;]*&lt;span class="short"&gt;</expression>
      </RegExp>
      <RegExp input="$$6" output="\1" dest="8">
        <!-- NOTE(review): this repeating pattern is empty as posted; a
             clean-up pattern may have been lost when the code was pasted.
             Confirm against a known-good copy of the scraper. -->
        <expression repeat="yes"></expression>
      </RegExp>
      <expression></expression>
    </RegExp>
    <!-- Rating: first number following the "Show Score" label. -->
    <RegExp input="$$1" output="&lt;rating&gt;\1&lt;/rating&gt;" dest="5+">
      <expression>&lt;span&gt;Show Score&lt;/span&gt;[^0-9]*([0-9\.]*)</expression>
    </RegExp>
    <!-- Votes: digits and commas in the span preceding the word "Votes". -->
    <RegExp input="$$1" output="&lt;votes&gt;\1&lt;/votes&gt;" dest="5+">
      <expression>&lt;span&gt;([0-9,]*)&lt;/span&gt;[^&lt;]*Votes</expression>
    </RegExp>
    <!-- Cast: \1 is the person's name (link text before the "(photos)" link),
         \2 the text after "Role: ". -->
    <RegExp input="$$2" output="&lt;actor&gt;&lt;name&gt;\1&lt;/name&gt;&lt;role&gt;\2&lt;/role&gt;&lt;/actor&gt;" dest="5+">
      <expression repeat="yes">&gt;([^&lt;]*)&lt;/a&gt;&lt;/h3&gt; &lt;a class="photos_link" href="http://www\.tv\.com/[^/]*/person/[0-9]*/photos\.html\?tag=cast;stars;photos;[0-9]*"&gt;\(photos\)&lt;/a&gt;&lt;/div&gt;&lt;div class="role"&gt;Role: ([^&lt;]*)&lt;/div&gt;</expression>
    </RegExp>
    <!-- Thumbnail: first program header image URL found on the page. -->
    <RegExp input="$$1" output="&lt;thumb&gt;\1&lt;/thumb&gt;" dest="5+">
      <expression>(http://image\.com\.com/tv/images/content_headers/program_new/[0-9]*\.jpg)</expression>
    </RegExp>
    <!-- Status: contents of the program_status_name span. -->
    <RegExp input="$$1" output="&lt;status&gt;\1&lt;/status&gt;" dest="5+">
      <expression trim="1">&lt;span class="program_status_name"&gt;([^&lt;]*)&lt;/span&gt;</expression>
    </RegExp>
    <!-- Premiere date: contents of the start_date span. -->
    <RegExp input="$$1" output="&lt;premiered&gt;\1&lt;/premiered&gt;" dest="5+">
      <expression trim="1">&lt;span class="start_date"&gt;([^&lt;]*)&lt;/span&gt;</expression>
    </RegExp>
    <!-- Episode guide: one listing URL per season number linked from buffer 3.
         The inner RegExp writes to buffer 8 without "+", reusing the buffer
         the plot step used earlier. -->
    <RegExp input="$$8" output="&lt;episodeguide&gt;\1&lt;/episodeguide&gt;" dest="5+">
      <RegExp input="$$3" output="&lt;url&gt;http://www.tv.com/show/$$4/episode_listings.html?season=\1&lt;/url&gt;" dest="8">
        <expression repeat="yes">/show/[0-9]+/episode_listings\.html\?season=([0-9]+)</expression>
      </RegExp>
      <expression noclean="1"></expression>
    </RegExp>
    <expression noclean="1"></expression>
  </RegExp>
</GetDetails>
<!-- GetEpisodeList: reads one season listing page from buffer 1, captures the
     season number into buffer 6, emits one <episode> per listed episode into
     buffer 5, then wraps buffer 5 in <episodeguide> and stores it in buffer 3.
     Markup in output attributes and regex patterns is entity-escaped so this
     file is well-formed XML; the parsed values are unchanged. -->
<GetEpisodeList dest="3">
  <RegExp input="$$5" output="&lt;episodeguide&gt;\1&lt;/episodeguide&gt;" dest="3">
    <!-- Season number: the digits inside the first matching <strong> element.
         NOTE(review): the pattern begins with a literal space as posted; text
         before it may have been lost in the paste. Confirm. -->
    <RegExp input="$$1" output="\1" dest="6">
      <expression> [^&lt;]*&lt;strong&gt;([0-9]+)&lt;/strong&gt;</expression>
    </RegExp>
    <!-- Episodes: \1 is the episode number, \2 the tv.com episode id, \3 the
         title. $$6 supplies the season captured above.
         NOTE(review): the "<url >" start tag has a trailing space as posted;
         an attribute may have been dropped. Confirm. -->
    <RegExp input="$$1" output="&lt;episode&gt;&lt;title&gt;\3&lt;/title&gt;&lt;id&gt;\2&lt;/id&gt;&lt;url &gt;http://www.tv.com/episode/\2/summary.html&lt;/url&gt;&lt;epnum&gt;\1&lt;/epnum&gt;&lt;season&gt;$$6&lt;/season&gt;&lt;/episode&gt;" dest="5">
      <expression repeat="yes">&lt;div&gt;([0-9]*)&lt;/div&gt;&lt;/td&gt;&lt;td class="ep_title"&gt;&lt;div&gt;&lt;a href="http://www\.tv\.com/[^/]*/[^/]*/episode/([0-9]*)/summary\.html[^&gt;]*&gt;([^&lt;]*)&lt;/a&gt;</expression>
    </RegExp>
    <expression noclean="1"></expression>
  </RegExp>
</GetEpisodeList>
<!-- GetEpisodeDetails: extracts per-episode fields from the episode summary page
     in buffer 1, appending each to buffer 5, then wraps buffer 5 in <details>
     and stores it in buffer 3.
     Markup in output attributes and regex patterns is entity-escaped so this
     file is well-formed XML; the parsed values are unchanged. -->
<GetEpisodeDetails dest="3">
  <RegExp input="$$5" output="&lt;details&gt;\1&lt;/details&gt;" dest="3">
    <!-- Title: the part of the content_title <h1> after the first colon.
         Written with dest="5" (no "+"), so it starts buffer 5 afresh. -->
    <RegExp input="$$1" output="&lt;title&gt;\1&lt;/title&gt;" dest="5">
      <expression>&lt;div class="content_title"&gt;[^&lt;]*&lt;h1&gt;[^:]*:([^&lt;]*)&lt;/h1&gt;</expression>
    </RegExp>
    <!-- Plot: text of the "deck" paragraph up to the next "<a " (the capture
         stops at any "=" character). -->
    <RegExp input="$$1" output="&lt;plot&gt;\1&lt;/plot&gt;" dest="5+">
      <expression>&lt;p class="deck"&gt;([^=]*)&lt;a </expression>
    </RegExp>
    <!-- Rating: number in the span following "Episode score". -->
    <RegExp input="$$1" output="&lt;rating&gt;\1&lt;/rating&gt;" dest="5+">
      <expression>Episode score[^&lt;]*&lt;span&gt;([0-9\.]*)&lt;/span&gt;</expression>
    </RegExp>
    <!-- Air date: text following the "First Aired:" label. -->
    <RegExp input="$$1" output="&lt;aired&gt;\1&lt;/aired&gt;" dest="5+">
      <expression>&lt;span&gt;First Aired:&lt;/span&gt;([^&lt;]*)&lt;/li&gt;</expression>
    </RegExp>
    <!-- Cast: \1 is the link text (name), \2 the parenthesised text after it (role). -->
    <RegExp input="$$1" output="&lt;actor&gt;&lt;name&gt;\1&lt;/name&gt;&lt;role&gt;\2&lt;/role&gt;&lt;/actor&gt;" dest="5+">
      <expression repeat="yes">"&gt;([^&lt;]*)&lt;/a&gt; \(([^&lt;]*)\)[^&lt;]*&lt;</expression>
    </RegExp>
    <!-- Director: text of the first link after the "Director:" label. -->
    <RegExp input="$$1" output="&lt;director&gt;\1&lt;/director&gt;" dest="5+">
      <expression>Director:&lt;/dt&gt;&lt;dd&gt;&lt;a [^&gt;]*&gt;([^&lt;]*)&lt;/a&gt;</expression>
    </RegExp>
    <!-- Writer credit: matches only the link whose attribute ends in "writer;0",
         so a single writer is captured (known limitation noted by the author). -->
    <RegExp input="$$1" output="&lt;credits&gt;\1&lt;/credits&gt;" dest="5+">
      <expression>writer;0"&gt;([^&lt;]*)&lt;/a&gt;</expression>
    </RegExp>
    <!-- Thumbnail: first episode header image URL found on the page. -->
    <RegExp input="$$1" output="&lt;thumb&gt;\1&lt;/thumb&gt;" dest="5+">
      <expression>(http://image\.com\.com/tv/images/content_headers/episode_new/[0-9]*\.jpg)</expression>
    </RegExp>
    <!-- Production code: text following the "Prod Code:" label. -->
    <RegExp input="$$1" output="&lt;code&gt;\1&lt;/code&gt;" dest="5+">
      <expression>&lt;span&gt;Prod Code:&lt;/span&gt;([^&lt;]*)&lt;/li&gt;</expression>
    </RegExp>
    <expression noclean="1"></expression>
  </RegExp>
</GetEpisodeDetails>
</scraper>