Class: YayImdbs
- Inherits:
-
Object
- Object
- YayImdbs
- Defined in:
- lib/yay_imdbs.rb
Constant Summary collapse
- IMDB_BASE_URL =
'http://www.imdb.com/'
- IMDB_SEARCH_URL =
IMDB_BASE_URL + 'find?s=tt&q='
- IMDB_MOVIE_URL =
IMDB_BASE_URL + 'title/tt'
- STRIP_WHITESPACE =
/(\s{2,}|\n|\||\302\240\302\273)/u
- MORE_INFO_LINKS =
['See more', 'Add/edit official sites', 'See all certifications', 'See full summary', 'see all', ]
- DATE_PROPERTIES =
[:release_date]
- LIST_PROPERTIES =
[:genres, :plot_keywords, :country, :sound_mix, :language]
- INT_LIST_PROPERTIES =
[:year, :season]
- PROPERTY_ALIAS =
{:genres => :genre, :taglines => :tagline, :year => :years, :season => :seasons, :language => :languages, :motion_picture_rating_mpaa => :mpaa, :official_sites => :official_site}
- OFFICAL_SITE_REGEX =
/<a href="([^"]+)"[^>]*>Official site<\/a>/
Class Method Summary collapse
- .clean_movie_property(key, value, imdb_id) ⇒ Object
-
.clean_title(movie_title) ⇒ Object
Removes the surrounding double quotes that seem to appear on TV show names.
- .get_episodes_page(imdb_id) ⇒ Object
- .get_media_page(url_fragment) ⇒ Object
- .get_movie_page(imdb_id) ⇒ Object
-
.get_official_site_url(value, imdb_id) ⇒ Object
TODO capture all official sites, not all sites have an “Official site” link (e.g. Lost).
- .get_official_sites_page(imdb_id) ⇒ Object
- .get_search_page(name) ⇒ Object
- .get_title_and_year_from_meta(doc) ⇒ Object
- .movie_properties(doc) ⇒ Object
- .scrap_episodes(info_hash) ⇒ Object
- .scrap_images(doc, info_hash) ⇒ Object
- .scrap_movie_info(imdb_id) ⇒ Object
- .search_for_imdb_id(name, year = nil, type = nil) ⇒ Object
- .search_imdb(search_term) ⇒ Object
-
.strip_whitespace(s) ⇒ Object
Hackiness to get around a Ruby 1.9 encoding issue.
- .video_type(td) ⇒ Object
- .video_type_from_meta(doc) ⇒ Object
Class Method Details
.clean_movie_property(key, value, imdb_id) ⇒ Object
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
# File 'lib/yay_imdbs.rb', line 127
# Converts the raw text of one scraped property into a typed value.
#
# The conversion is selected by +key+:
# * keys in DATE_PROPERTIES are parsed as "day month-name year" dates
#   (nil when parsing fails),
# * :runtime becomes the number of minutes found before "min" (nil otherwise),
# * :official_sites is resolved through get_official_site_url,
# * keys in LIST_PROPERTIES are split on "|" and reduced to letters, digits and "-",
# * keys in INT_LIST_PROPERTIES are split on "|" and kept as positive integers.
# Any other key is returned unchanged.
#
# @param key the property key, compared against symbols such as :runtime
# @param value the raw property text (presumably a String; verify against caller)
# @param imdb_id passed through to get_official_site_url
# @return the cleaned value
def clean_movie_property(key, value, imdb_id)
  if DATE_PROPERTIES.include?(key)
    begin
      Date.strptime(value, '%d %B %Y')
    rescue StandardError
      # Best effort: an unparseable date yields nil rather than an error.
      nil
    end
  elsif key == :runtime
    value =~ /(\d+)\smin/ ? $1.to_i : nil
  elsif key == :official_sites
    get_official_site_url(value, imdb_id)
  elsif LIST_PROPERTIES.include?(key)
    value.split('|').map { |item| item.gsub(/[^a-zA-Z0-9\-]/, '') }
  elsif INT_LIST_PROPERTIES.include?(key)
    value.split('|').map { |item| item.strip.to_i }.select { |number| number > 0 }
  else
    value
  end
end
.clean_title(movie_title) ⇒ Object
Removes the surrounding double quotes that seem to appear on TV show names.
235 236 237 238 |
# File 'lib/yay_imdbs.rb', line 235
# Removes the double quotes that surround some titles (seen on TV show names)
# and trims surrounding whitespace.
#
# Whitespace is trimmed before looking for the quotes, so a padded value such
# as '  "Lost"  ' is unquoted too. The pattern is anchored to the whole string
# (\A / \z) rather than to a single line.
#
# @param movie_title [String] title text as scraped
# @return [String] the title without enclosing quotes or outer whitespace
def clean_title(movie_title)
  title = movie_title.strip
  title = $1 if title =~ /\A"(.*)"\z/
  # Strip again: the quotes may have enclosed padding of their own.
  title.strip
end
.get_episodes_page(imdb_id) ⇒ Object
215 216 217 |
# File 'lib/yay_imdbs.rb', line 215
# Opens the "/episodes" page of a title and parses it with Nokogiri::HTML.
#
# @param imdb_id [String] id digits, appended to IMDB_MOVIE_URL
# @return the parsed document
def get_episodes_page(imdb_id)
  episodes_url = IMDB_MOVIE_URL + imdb_id + '/episodes'
  Nokogiri::HTML(open(episodes_url))
end
.get_media_page(url_fragment) ⇒ Object
219 220 221 |
# File 'lib/yay_imdbs.rb', line 219
# Opens a page addressed relative to IMDB_BASE_URL and parses it with
# Nokogiri::HTML.
#
# @param url_fragment [String] path appended to IMDB_BASE_URL
# @return the parsed document
def get_media_page(url_fragment)
  media_url = IMDB_BASE_URL + url_fragment
  Nokogiri::HTML(open(media_url))
end
.get_movie_page(imdb_id) ⇒ Object
207 208 209 |
# File 'lib/yay_imdbs.rb', line 207
# Opens the main page of a title and parses it with Nokogiri::HTML.
#
# @param imdb_id [String] id digits, appended to IMDB_MOVIE_URL
# @return the parsed document
def get_movie_page(imdb_id)
  movie_url = IMDB_MOVIE_URL + imdb_id
  Nokogiri::HTML(open(movie_url))
end
.get_official_site_url(value, imdb_id) ⇒ Object
TODO capture all official sites, not all sites have an “Official site” link (e.g. Lost)
159 160 161 162 163 164 165 |
# File 'lib/yay_imdbs.rb', line 159
# Extracts the URL of the "Official site" link.
#
# The link is searched for in +value+ first; when it is not there, the
# title's official-sites page is fetched and its HTML is searched instead.
#
# TODO capture all official sites, not all sites have an "Official site"
# link (e.g. Lost)
#
# @param value [String] HTML text to search first
# @param imdb_id passed to get_official_sites_page for the fallback lookup
# @return [String, nil] the href of the link, or nil when neither source has one
def get_official_site_url(value, imdb_id)
  link = value.match(OFFICAL_SITE_REGEX)
  link ||= get_official_sites_page(imdb_id).inner_html.match(OFFICAL_SITE_REGEX)
  link ? link[1] : nil
end
.get_official_sites_page(imdb_id) ⇒ Object
211 212 213 |
# File 'lib/yay_imdbs.rb', line 211
# Opens the "/officialsites" page of a title and parses it with
# Nokogiri::HTML.
#
# @param imdb_id [String] id digits, appended to IMDB_MOVIE_URL
# @return the parsed document
def get_official_sites_page(imdb_id)
  sites_url = IMDB_MOVIE_URL + imdb_id + '/officialsites'
  Nokogiri::HTML(open(sites_url))
end
.get_search_page(name) ⇒ Object
203 204 205 |
# File 'lib/yay_imdbs.rb', line 203
# Opens the title-search page for +name+ and parses it with Nokogiri::HTML.
#
# The term is form-encoded before being appended to IMDB_SEARCH_URL, so
# characters that carry meaning inside a query string ("&", "=", "+") stay
# part of the search term. URI.escape left those characters untouched, which
# cut a term such as "Fast & Furious" short at the ampersand.
#
# @param name [String] free-text search term
# @return the parsed document
def get_search_page(name)
  Nokogiri::HTML(open(IMDB_SEARCH_URL + URI.encode_www_form_component(name)))
end
.get_title_and_year_from_meta(doc) ⇒ Object
223 224 225 226 227 228 229 230 231 232 |
# File 'lib/yay_imdbs.rb', line 223
# Reads the title and year from the page's <meta name="title"> element.
#
# Returns [nil, nil] when the element is missing or its content does not have
# the expected "Title (…year…)" shape, so callers can test the title for nil.
#
# @param doc parsed page responding to at_css
# @return [Array] two elements: the cleaned title (String or nil) and the
#   year (Integer or nil)
def get_title_and_year_from_meta(doc)
  meta = doc.at_css("meta[name='title']")
  title_text = meta && meta['content']
  # Bail out before touching the text: a page without the meta tag is not an error.
  return [nil, nil] unless title_text

  title_text = title_text.sub(/^IMDb - /, '')
  # Matches 'Movie Name (2010)' or 'Movie Name (2010/I)' or 'Lost (TV Series 2004–2010)'
  if title_text =~ /(.*) \([^\)0-9]*(\d{4})((\/\w*)|(.\d{4})|([^\)]*))?\)/
    movie_title = clean_title($1)
    movie_year = $2.to_i
  end
  return movie_title, movie_year
end
.movie_properties(doc) ⇒ Object
146 147 148 149 150 151 152 153 154 155 156 |
# File 'lib/yay_imdbs.rb', line 146
# Yields every "heading: value" pair found in the page's info blocks.
#
# For each <h4> nested in a <div>, the heading text becomes the key and the
# rest of the parent element's text (after the heading) becomes the value.
# A trailing "more info" link text (see MORE_INFO_LINKS) is removed from the
# value.
#
# @param doc parsed page responding to css
# @yield [symbol_key, value]
# @yieldparam symbol_key [Symbol] heading lower-cased, stripped of punctuation,
#   with whitespace turned into underscores
# @yieldparam value [String] the cleaned text following the heading
def movie_properties(doc)
  # Group the alternatives so that "$" applies to all of them. Without the
  # group the anchor bound only to the last alternative and the other link
  # texts were removed from anywhere inside the value.
  trailing_link = /(?:#{Regexp.union(MORE_INFO_LINKS).source})$/i

  doc.css("div h4").each do |h4|
    div = h4.parent
    raw_key = h4.inner_text
    div_text = div.inner_text

    # Everything after the first occurrence of the heading text is the value.
    value = div_text[(div_text.index(raw_key) + raw_key.length)..-1]
    # \302\240\302\273 is a non-breaking space followed by a right guillemet.
    value = value.gsub(/\302\240\302\273/u, '').strip.gsub(trailing_link, '').strip

    key = raw_key.sub(':', '').strip.downcase
    symbol_key = key.gsub(/[^a-zA-Z0-9 ]/, '').gsub(/\s/, '_').to_sym
    yield symbol_key, value
  end
end
.scrap_episodes(info_hash) ⇒ Object
184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
# File 'lib/yay_imdbs.rb', line 184
# Scrapes the episode list of a series and stores it under
# info_hash['episodes'].
#
# Each entry is a Hash with the string keys "series", "episode" and "title",
# plus 'date' (a Date, or nil when the text cannot be parsed) and, when the
# date text is found in the entry, 'plot' (the text following the date).
# Entries whose heading does not read "Season N, Episode M:" are skipped.
#
# @param info_hash [Hash] must provide :imdb_id; receives 'episodes'
# @return [Array<Hash>] the episodes that were stored
def scrap_episodes(info_hash)
  episodes = []
  doc = get_episodes_page(info_hash[:imdb_id])

  doc.css(".filter-all").each do |e_div|
    heading = e_div.at_css('h3')
    next unless heading && heading.inner_text =~ /Season (\d+), Episode (\d+):/

    episode = {"series" => $1.to_i, "episode" => $2.to_i, "title" => $'.strip}

    date_node = e_div.at_css('strong')
    raw_date = date_node && date_node.inner_text.strip

    episode['date'] = begin
      Date.parse(raw_date)
    rescue StandardError
      # Best effort: unknown or partial dates are stored as nil.
      nil
    end

    if raw_date
      # The day can sometimes be "????". A literal substring search copes with
      # that, whereas interpolating the text into a regex did not.
      entry_text = e_div.inner_text
      date_at = entry_text.index(raw_date)
      episode['plot'] = entry_text[(date_at + raw_date.length)..-1].strip if date_at
    end

    episodes << episode
  end

  info_hash['episodes'] = episodes
end
.scrap_images(doc, info_hash) ⇒ Object
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
# File 'lib/yay_imdbs.rb', line 167
# Collects poster image URLs into +info_hash+.
#
# Sets 'medium_image' to the primary image's src and 'small_image' to a
# variant built by rewriting the part of that URL after "@@". When the
# primary image is wrapped in a link, the linked page is fetched and the src
# of its #primary-img element is stored as 'large_image'.
# Nothing is stored when there is no primary image or its URL contains
# "/nopicture/".
#
# @param doc parsed title page
# @param info_hash [Hash] receives the image entries
def scrap_images(doc, info_hash)
  poster_src = doc.at_css("td[id=img_primary] a img").try(:[], 'src')
  return if poster_src.nil? || poster_src =~ /\/nopicture\//

  info_hash['medium_image'] = poster_src
  # Small thumbnail image, obtained by rewriting the tail of the medium URL.
  info_hash['small_image'] = poster_src.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')

  # Follow the poster link to look for a larger version of the image.
  media_href = doc.at_css("td[id=img_primary] a").try(:[], 'href')
  return unless media_href

  media_doc = get_media_page(media_href)
  info_hash['large_image'] = media_doc.at_css("img[id=primary-img]").try(:[], 'src')
end
.scrap_movie_info(imdb_id) ⇒ Object
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
# File 'lib/yay_imdbs.rb', line 88
# Scrapes the title page for +imdb_id+ into a hash.
#
# The hash (with indifferent access) holds :imdb_id, :title, :year,
# :video_type, :plot and :rating, then one entry per property yielded by
# movie_properties: the cleaned value under the property key, the untouched
# text under "raw_<key>", and a copy under the PROPERTY_ALIAS name when one
# exists. Image URLs are added by scrap_images, and episodes by
# scrap_episodes when the page has a 'season' property.
#
# @param imdb_id [String] id digits of the title
# @return [Hash] the scraped information
# @raise [RuntimeError] when no title can be read from the page, or when the
#   page yields no properties at all
def scrap_movie_info(imdb_id)
  info_hash = {:imdb_id => imdb_id}.with_indifferent_access

  doc = get_movie_page(imdb_id)
  title, year = get_title_and_year_from_meta(doc)
  info_hash[:title], info_hash[:year] = title, year
  if info_hash['title'].nil?
    # If we can't get title and year something is wrong
    raise "Unable to find title or year for imdb id #{imdb_id}"
  end
  info_hash[:video_type] = video_type_from_meta(doc)

  info_hash[:plot] = doc.xpath("//td[@id='overview-top']/p[2]").inner_text.strip
  # Best effort: a page without the rating element leaves the rating nil.
  info_hash[:rating] = doc.at_css('.star-box-giga-star').inner_text.gsub(/[^0-9.]/, '').to_f rescue nil

  found_info_divs = false
  movie_properties(doc) do |key, value|
    found_info_divs = true
    info_hash["raw_#{key}"] = value
    info_hash[key] = clean_movie_property(key, value, imdb_id)
    info_hash[PROPERTY_ALIAS[key]] = info_hash[key] if PROPERTY_ALIAS[key]
  end

  unless found_info_divs
    # If we don't find any info divs assume parsing failed
    raise "No info divs found for imdb id #{imdb_id}"
  end

  # Hack: tv shows can have a year property, which is a list; restore the
  # single year read from the meta title.
  info_hash[:year] = year

  scrap_images(doc, info_hash)

  # Scrape episodes if this is a tv series
  scrap_episodes(info_hash) if info_hash.has_key?('season')

  return info_hash
end
.search_for_imdb_id(name, year = nil, type = nil) ⇒ Object
42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/yay_imdbs.rb', line 42
# Searches for +name+ and returns the id of the first result that fits the
# optional filters.
#
# +type+ is compared as a string, and +year+ as an integer, so both accept
# the value in either form (e.g. 2004 or "2004", :tv_show or "tv_show").
#
# @param name [String] search term
# @param year [Integer, String, nil] required year; nil accepts any year
# @param type [Symbol, String, nil] required video type; nil accepts any type
# @return [String, nil] the :imdb_id of the first fitting result, or nil
def search_for_imdb_id(name, year=nil, type=nil)
  # Results carry the year as an Integer; normalise so "2004" matches 2004.
  wanted_year = year.nil? ? nil : year.to_i

  match = search_imdb(name).find do |result|
    # Ensure result is the correct video type
    next false if type && (result[:video_type].to_s != type.to_s)
    # If no year provided the first result of the right type wins
    wanted_year.nil? || result[:year] == wanted_year
  end

  match && match[:imdb_id]
end
.search_imdb(search_term) ⇒ Object
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'lib/yay_imdbs.rb', line 55
# Runs a title search and returns the results as an array of hashes with the
# keys :imdb_id, :name, :year and :video_type.
#
# When the fetched page carries a parsable meta title it is treated as the
# title page of an exact match, and a single result is built from its meta
# data and canonical link. Otherwise every link to "/title/tt<digits>" found
# inside a table cell becomes one result.
#
# @param search_term [String] free-text search term
# @return [Array<Hash>] search results, possibly empty
# @raise [RuntimeError] when an exact-match page has no usable canonical link
def search_imdb(search_term)
  search_results = []

  doc = get_search_page(search_term)
  # If the search is an exact match imdb will redirect to the movie page not
  # the search results page; the title meta element tells the two apart.
  movie_title, movie_year = get_title_and_year_from_meta(doc)
  if movie_title
    canonical_link = doc.at_css("link[rel='canonical']").try(:[], 'href')
    if canonical_link && canonical_link =~ /tt(\d+)\//
      return [{:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => video_type_from_meta(doc)}]
    else
      raise "Unable to extract imdb id from exact search result"
    end
  end

  doc.css("td").each do |td|
    td.css("a").each do |link|
      href = link['href']
      current_name = link.content

      # Ignore links with no text (e.g. image links) or links that don't link to movie pages
      next unless current_name.present? && href =~ /^\/title\/tt(\d+)/
      imdb_id = $1

      # The year is read from the cell text, e.g. "(2004)" or "(2004/I)";
      # to_i keeps only the leading digits. It stays nil when the cell has none.
      current_year = $1.to_i if td.inner_text =~ /\((\d{4}\/?\w*)\)/
      search_results << {:imdb_id => imdb_id, :name => clean_title(current_name), :year => current_year, :video_type => video_type(td)}
    end
  end

  return search_results
end
.strip_whitespace(s) ⇒ Object
Hackiness to get around a Ruby 1.9 encoding issue
241 242 243 |
# File 'lib/yay_imdbs.rb', line 241
# Removes everything matched by STRIP_WHITESPACE from +s+ and trims the
# result. The string is converted to UTF-8 first as a workaround for a
# Ruby 1.9 encoding issue.
#
# @param s [String] text to clean
# @return [String] the cleaned text
def strip_whitespace(s)
  utf8_text = s.encode('UTF-8')
  utf8_text.gsub(STRIP_WHITESPACE, '').strip
end
.video_type(td) ⇒ Object
245 246 247 248 |
# File 'lib/yay_imdbs.rb', line 245
# Classifies a search-result cell as a TV show or a movie.
#
# A cell whose text contains "(TV series)" or "(TV)" is a TV show. The check
# ignores letter case, so "(TV Series)" is recognised as well.
#
# @param td search-result cell responding to content
# @return [Symbol] :tv_show or :movie
def video_type(td)
  return :tv_show if td.content =~ /\((TV series|TV)\)/i
  :movie
end
.video_type_from_meta(doc) ⇒ Object
250 251 252 253 |
# File 'lib/yay_imdbs.rb', line 250
# Classifies a title page as a TV show or a movie from its
# <meta property="og:type"> element.
#
# @param doc parsed title page responding to at_css
# @return [Symbol] :tv_show when the element's content contains "tv_show",
#   :movie otherwise (including when the element is missing)
def video_type_from_meta(doc)
  og_type = doc.at_css("meta[property='og:type']")
  type_text = og_type && og_type['content']
  type_text =~ /tv_show/ ? :tv_show : :movie
end