2009-08-06, 12:58
Okay, I've got a pretty good skeleton of a scraper going here, but no fanart or thumbs yet. I'm slightly at a loss as to how to tackle that, since other scrapers look like they use the IMDb ID to fetch artwork. I can do the reverse (i.e., look up an RT movie from its IMDb ID with http://www.rottentomatoes.com/alias?type=imdbid&s=[imdb id #]), but RT movie pages do not contain the IMDb ID or a way to look it up from within that site. So should I:
1) rewrite the scraper to look up the IMDb ID with CreateSearchUrl and call all the RT stuff with functions?
or
2) look up the IMDb title with a call to Google or the like? I think PTGate looks up the IMDb ID like this if the movie's page doesn't include it.
Both methods seem less than ideal, but they're the only options I can think of right now.
Here's what I've done so far anyway:
1) rewrite the scraper to look up the IMDb ID with CreateSearchUrl and call all the RT stuff with functions?
or
2) look up the IMDb title with a call to Google or the like? I think PTGate looks up the IMDb ID like this if the movie's page doesn't include it.
Both methods seem less than ideal, but they're the only options I can think of right now.
Here's what I've done so far anyway:
Code:
<?xml version="1.0" encoding="utf-8"?>
<scraper name="Rotten Tomatoes 0.5" date="2009-08-05" content="movies" framework="1.0" thumb="rottentomatoes.png" language="">
<!--
  GetSettings: assembles the scraper's settings document.
  Each inner RegExp writes one setting definition into buffer 5 (the second
  appends via dest="5+"); the outer RegExp wraps buffer 5 in a settings
  element and stores the result in buffer 3.
    locality    : labelenum, one of us|au|uk, default au
    classreason : bool, default false
  Markup inside the output attributes is entity-escaped so that this file is
  well-formed XML (raw angle brackets and nested double quotes are not allowed
  inside an attribute value).
-->
<GetSettings dest="3">
  <RegExp input="$$5" output="&lt;settings&gt;\1&lt;/settings&gt;" dest="3">
    <RegExp input="$$1" output="&lt;setting label=&quot;Location&quot; type=&quot;labelenum&quot; values=&quot;us|au|uk&quot; id=&quot;locality&quot; default=&quot;au&quot;&gt;&lt;/setting&gt;" dest="5">
      <expression></expression>
    </RegExp>
    <RegExp input="$$1" output="&lt;setting label=&quot;Retrieve Classification Reason&quot; type=&quot;bool&quot; id=&quot;classreason&quot; default=&quot;false&quot;&gt;&lt;/setting&gt;" dest="5+">
      <expression></expression>
    </RegExp>
    <expression noclean="1"></expression>
  </RegExp>
</GetSettings>
<!--
  NfoUrl: looks in the input (buffer 1) for a Rotten Tomatoes movie URL on the
  configured locality host and copies it to buffer 3.
  The slug character class includes the hyphen so that slugs of the form
  "1234567-some_title" are captured whole instead of being cut at the hyphen;
  GetSearchResults already accepts any slug up to the closing quote.
  NOTE(review): only URLs on the currently selected locality host match;
  confirm whether URLs from other hosts should be accepted as well.
-->
<NfoUrl dest="3">
  <RegExp input="$$1" output="\1" dest="3">
    <expression noclean="1">(http://$INFO[locality]\.rottentomatoes\.com/m/[A-Za-z0-9_-]*)</expression>
  </RegExp>
</NfoUrl>
<!--
  CreateSearchUrl: turns the input in buffer 1 into a full_search.php URL on
  the host selected by the "locality" setting and stores it in buffer 3.
  The expression is empty; presumably the engine then treats the whole input
  as capture \1 (TODO confirm against the scraper engine documentation).
-->
<CreateSearchUrl dest="3">
  <RegExp
      dest="3"
      input="$$1"
      output="http://$INFO[locality].rottentomatoes.com/search/full_search.php?search=\1">
    <expression noclean="1"></expression>
  </RegExp>
</CreateSearchUrl>
<!--
  GetSearchResults: converts the search page (buffer 1) into a results list.
  The inner RegExp repeats over every link of the form /m/SLUG and writes one
  entity (title "NAME (YEAR)" plus the movie URL on the configured locality
  host) into buffer 5; the outer RegExp wraps buffer 5 in a results document
  and stores it in buffer 8.
  Markup in the output attributes and in the expression text is
  entity-escaped so that this file is well-formed XML.
-->
<GetSearchResults dest="8">
  <RegExp input="$$5" output="&lt;?xml version=&quot;1.0&quot; encoding=&quot;iso-8859-1&quot; standalone=&quot;yes&quot;?&gt;&lt;results&gt;\1&lt;/results&gt;" dest="8">
    <RegExp input="$$1" output="&lt;entity&gt;&lt;title&gt;\2 (\3)&lt;/title&gt;&lt;url&gt;http://$INFO[locality].rottentomatoes.com/m/\1&lt;/url&gt;&lt;/entity&gt;" dest="5">
      <expression repeat="yes">&lt;a href="/m/([^"]*)"&gt;([^&lt;]*).*?([0-9]{4})</expression>
    </RegExp>
    <expression noclean="1"></expression>
  </RegExp>
</GetSearchResults>
<!--
  GetDetails: extracts movie details from the movie page (buffer 1).
  Every inner RegExp appends one or more detail elements to buffer 8; the
  outer RegExp wraps buffer 8 in a details element and stores it in buffer 3.
  Markup in the output attributes and in the expression text is
  entity-escaped so that this file is well-formed XML.
  No thumb or fanart is produced here.
-->
<GetDetails dest="3">
  <RegExp input="$$8" output="&lt;details&gt;\1&lt;/details&gt;" dest="3">

    <!-- Title and year from the page heading, e.g. "Some Title (2009)". -->
    <RegExp input="$$1" output="&lt;title&gt;\1&lt;/title&gt;&lt;originaltitle&gt;\1&lt;/originaltitle&gt;&lt;year&gt;\2&lt;/year&gt;" dest="8">
      <expression noclean="1" trim="1">&lt;h1.class="movie_title clearfix"&gt;([\S\s]*)\(([0-9]{4})\)&lt;/h1&gt;[\S\s]*dialog_content clearfix</expression>
    </RegExp>

    <!--
      Directors: the inner RegExp isolates the director section into scratch
      buffer 7, then every link text inside it becomes one director element.
      The name capture takes all text up to the next tag so that names with
      hyphens, periods, apostrophes or accented letters are kept whole, and
      links without text are skipped.
    -->
    <RegExp input="$$7" output="&lt;director&gt;\1&lt;/director&gt;" dest="8+">
      <RegExp input="$$1" output="\1" dest="7">
        <expression noclean="1">&lt;p class="movie_crew_shortened[\S\s]*Director:([\S\s]*)movie_crew_all</expression>
      </RegExp>
      <expression repeat="yes" noclean="1">&lt;a.href="[^&gt;]*&gt;([^&lt;]+)</expression>
    </RegExp>

    <!-- Classification only (used when the classreason setting is off). -->
    <RegExp conditional="!classreason" input="$$1" output="&lt;mpaa&gt;\1&lt;/mpaa&gt;" dest="8+">
      <expression>&lt;div id="movie_stats"&gt;[\S\s]*&lt;span class="content"&gt;([^&lt;]*)[\S\s]*\[See.Full.Rating\]</expression>
    </RegExp>

    <!-- Classification followed by its reason (classreason setting on). -->
    <RegExp conditional="classreason" input="$$1" output="&lt;mpaa&gt;\1 \2&lt;/mpaa&gt;" dest="8+">
      <expression>&lt;div id="movie_stats"&gt;[\S\s]*&lt;span class="content"&gt;([^&lt;]*)[\S\s]*\[See.Full.Rating\][\S\s]*movie_rating_reason".style="display:.none"&gt;([^&lt;]*)</expression>
    </RegExp>

    <RegExp input="$$1" output="&lt;runtime&gt;\1&lt;/runtime&gt;" dest="8+">
      <expression>Runtime:[^0-9]*([^&lt;]*)</expression>
    </RegExp>

    <!--
      Writers: uses its own scratch buffer (6) instead of sharing buffer 7
      with the director section, so that a page without a screenwriter
      section cannot re-emit the director names as credits.
    -->
    <RegExp input="$$6" output="&lt;credits&gt;\1&lt;/credits&gt;" dest="8+">
      <RegExp input="$$1" output="\1" dest="6">
        <expression noclean="1">class="label"&gt;Screenwriter:&lt;/span&gt;([\S\s]*)Story:&lt;/span&gt;</expression>
      </RegExp>
      <expression repeat="yes" noclean="1">&lt;a.href="[^&gt;]*&gt;([^&lt;]+)</expression>
    </RegExp>

    <!--
      Score taken from the selected tab's link title. One to three digits are
      accepted so that single-digit scores are not dropped.
      NOTE(review): the captured number looks like a 0 to 100 percentage;
      confirm the scale the media centre expects for the rating element.
    -->
    <RegExp input="$$1" output="&lt;rating&gt;\1&lt;/rating&gt;" dest="8+">
      <expression>&lt;li class="ui-tabs-selected"&gt;&lt;a title="([0-9]{1,3})</expression>
    </RegExp>

    <RegExp input="$$1" output="&lt;votes&gt;\1&lt;/votes&gt;" dest="8+">
      <expression>&lt;p&gt;Reviews Counted: ([0-9]*)</expression>
    </RegExp>

    <!-- Only the first genre link is captured (no repeat attribute). -->
    <RegExp input="$$1" output="&lt;genre&gt;\1&lt;/genre&gt;" dest="8+">
      <expression noclean="1">&lt;span.class="label"&gt;Genre:&lt;/span&gt;.&lt;span class="content"&gt;&lt;a.href="/movie/browser.php\?genre=[0-9]*"&gt;([^&lt;]*)</expression>
    </RegExp>

    <!-- Cast: own scratch buffer (4), same name capture as for directors. -->
    <RegExp input="$$4" output="&lt;actor&gt;&lt;name&gt;\1&lt;/name&gt;&lt;role&gt;&lt;/role&gt;&lt;/actor&gt;" dest="8+">
      <RegExp input="$$1" output="\1" dest="4">
        <expression noclean="1">&lt;span class="label"&gt;Starring:([\S\s]*)&lt;p class="movie_cast_all"</expression>
      </RegExp>
      <expression repeat="yes" noclean="1">&lt;a.href="[^&gt;]*&gt;([^&lt;]+)</expression>
    </RegExp>

    <RegExp input="$$1" output="&lt;plot&gt;\1&lt;/plot&gt;" dest="8+">
      <expression>&lt;span id="movie_synopsis_all" style="display: none;"&gt;([\S\s]*)&lt;a href="#" id="movie_synopsis_link</expression>
    </RegExp>

    <expression noclean="1"></expression>
  </RegExp>
</GetDetails>
</scraper>