Class: YayImdbs
- Inherits:
-
Object
- Object
- YayImdbs
- Defined in:
- lib/yay_imdbs.rb
Constant Summary collapse
- IMDB_BASE_URL =
'http://www.imdb.com/'
- IMDB_SEARCH_URL =
IMDB_BASE_URL + 'find?s=tt&q='
- IMDB_MOVIE_URL =
IMDB_BASE_URL + 'title/tt'
- STRIP_WHITESPACE =
/(\s{2,}|\n|\||\302\240\302\273)/u
- DATE_PROPERTIES =
[:release_date]
- LIST_PROPERTIES =
[:genres, :plot_keywords, :country, :sound_mix, :language]
- INT_LIST_PROPERTIES =
[:year, :season]
- PROPERTY_ALIAS =
{:genres => :genre, :taglines => :tagline, :year => :years, :season => :seasons, :language => :languages, :motion_picture_rating_mpaa => :mpaa}
Class Method Summary collapse
- .clean_movie_property(key, value) ⇒ Object
-
.clean_title(movie_title) ⇒ Object
Removes the surrounding double quotes that seem to appear on TV show names.
- .get_episodes_page(imdb_id) ⇒ Object
- .get_media_page(url_fragment) ⇒ Object
- .get_movie_page(imdb_id) ⇒ Object
- .get_search_page(name) ⇒ Object
- .get_title_and_year_from_meta(doc) ⇒ Object
- .movie_properties(doc) ⇒ Object
- .scrap_episodes(info_hash) ⇒ Object
- .scrap_images(doc, info_hash) ⇒ Object
- .scrap_movie_info(imdb_id) ⇒ Object
- .search_for_imdb_id(name, year = nil, type = nil) ⇒ Object
- .search_imdb(search_term) ⇒ Object
-
.strip_whitespace(s) ⇒ Object
Hack to work around a Ruby 1.9 encoding issue.
- .video_type(td) ⇒ Object
- .video_type_from_meta(doc) ⇒ Object
Class Method Details
.clean_movie_property(key, value) ⇒ Object
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# File 'lib/yay_imdbs.rb', line 118

# Converts the raw text of one scraped property into a typed value, chosen
# by the property key.
#
# * keys in DATE_PROPERTIES   -> Date parsed with '%d %B %Y', or nil when parsing fails
# * :runtime                  -> the number in front of "min" as an Integer, or nil
# * keys in LIST_PROPERTIES   -> Array of '|'-separated items with every character
#                                other than letters, digits and '-' removed
# * keys in INT_LIST_PROPERTIES -> Array of the positive integers found between '|'
# * anything else             -> the value, untouched
#
# @param key [Symbol] property name
# @param value [Object] raw property text as scraped (expected to be a String)
# @return [Object] the cleaned value as described above
def clean_movie_property(key, value)
  if DATE_PROPERTIES.include?(key)
    begin
      Date.strptime(value, '%d %B %Y')
    rescue StandardError
      # Unparseable dates are deliberately reported as "unknown".
      nil
    end
  elsif key == :runtime
    value =~ /(\d+)\smin/ ? $1.to_i : nil
  elsif LIST_PROPERTIES.include?(key)
    value.split('|').map { |item| item.gsub(/[^a-zA-Z0-9\-]/, '') }
  elsif INT_LIST_PROPERTIES.include?(key)
    # to_i yields 0 for non-numeric fragments; only positive numbers are kept.
    value.split('|').map { |item| item.strip.to_i }.select { |number| number > 0 }
  else
    value
  end
end
.clean_title(movie_title) ⇒ Object
Removes the surrounding double quotes that seem to appear on TV show names.
212 213 214 215 |
# File 'lib/yay_imdbs.rb', line 212

# Removes the surrounding double quotes that seem to appear on TV show names
# and trims whitespace.
#
# Whitespace is trimmed *before* looking for the quotes, so a padded title
# such as ' "Lost" ' is unquoted as well, and the quotes must enclose the
# whole string (\A / \z) rather than a single line of it.
#
# @param movie_title [String] title text as scraped
# @return [String] the title without enclosing quotes or outer whitespace
def clean_title(movie_title)
  title = movie_title.strip
  quoted = /\A"(.*)"\z/.match(title)
  title = quoted[1] if quoted
  # Trim again: the quotes themselves may have enclosed padding.
  title.strip
end
.get_episodes_page(imdb_id) ⇒ Object
193 194 195 |
# File 'lib/yay_imdbs.rb', line 193

# Fetches and parses the episode listing page of a title.
#
# @param imdb_id [String] id appended to IMDB_MOVIE_URL (which already ends
#   in 'title/tt'), followed by '/episodes'
# @return [Object] whatever Nokogiri::HTML returns for the fetched page
def get_episodes_page(imdb_id)
  episodes_url = IMDB_MOVIE_URL + imdb_id + '/episodes'
  # NOTE(review): `open` on a URL presumably relies on open-uri being loaded - confirm.
  response = open(episodes_url)
  Nokogiri::HTML(response)
end
.get_media_page(url_fragment) ⇒ Object
197 198 199 |
# File 'lib/yay_imdbs.rb', line 197

# Fetches and parses an arbitrary page below IMDB_BASE_URL.
#
# @param url_fragment [String] path appended verbatim to IMDB_BASE_URL
# @return [Object] whatever Nokogiri::HTML returns for the fetched page
def get_media_page(url_fragment)
  media_url = IMDB_BASE_URL + url_fragment
  # NOTE(review): `open` on a URL presumably relies on open-uri being loaded - confirm.
  response = open(media_url)
  Nokogiri::HTML(response)
end
.get_movie_page(imdb_id) ⇒ Object
189 190 191 |
# File 'lib/yay_imdbs.rb', line 189

# Fetches and parses the main page of a title.
#
# @param imdb_id [String] id appended to IMDB_MOVIE_URL (which already ends
#   in 'title/tt')
# @return [Object] whatever Nokogiri::HTML returns for the fetched page
def get_movie_page(imdb_id)
  title_url = IMDB_MOVIE_URL + imdb_id
  # NOTE(review): `open` on a URL presumably relies on open-uri being loaded - confirm.
  response = open(title_url)
  Nokogiri::HTML(response)
end
.get_search_page(name) ⇒ Object
185 186 187 |
# File 'lib/yay_imdbs.rb', line 185

# Fetches and parses the search result page for a title name.
#
# The name is encoded as a form/query component so that characters with a
# meaning inside a query string ('&', '#', '=', '+', ...) become part of the
# search term instead of cutting it short; URI.escape left them untouched.
#
# @param name [String] free-text search term
# @return [Object] whatever Nokogiri::HTML returns for the fetched page
def get_search_page(name)
  query = URI.encode_www_form_component(name)
  # NOTE(review): `open` on a URL presumably relies on open-uri being loaded - confirm.
  Nokogiri::HTML(open(IMDB_SEARCH_URL + query))
end
.get_title_and_year_from_meta(doc) ⇒ Object
201 202 203 204 205 206 207 208 209 |
# File 'lib/yay_imdbs.rb', line 201

# Reads the title and year out of the page's <meta name="title"> element.
#
# @param doc [Object] parsed page responding to `at_css`
# @return [Array] two elements: the cleaned title and the year as an Integer,
#   or [nil, nil] when the meta element is missing or its content does not
#   have the expected "Name (year)" shape
def get_title_and_year_from_meta(doc)
  title_text = doc.at_css("meta[name='title']").try(:[], 'content')

  # Matches 'Movie Name (2010)' or 'Movie Name (2010/I)' or 'Lost (TV Series 2004–2010)'
  found = title_text && /(.*) \([^\)0-9]*(\d{4})((\/\w*)|(.\d{4}))?\)/.match(title_text)
  return nil, nil unless found

  return clean_title(found[1]), found[2].to_i
end
.movie_properties(doc) ⇒ Object
135 136 137 138 139 140 141 142 143 144 145 146 147 |
# File 'lib/yay_imdbs.rb', line 135

# Walks every <h4> that sits inside a <div> and yields one property per
# heading: the heading text turned into a symbol, and the text of the
# heading's parent element that follows the heading.
#
# @param doc [Object] parsed page responding to `css`
# @yield [symbol_key, value]
# @yieldparam symbol_key [Symbol] heading without ':', lower-cased, stripped of
#   characters other than letters, digits and spaces, whitespace replaced by '_'
# @yieldparam value [String] trailing text with the link markers removed
def movie_properties(doc)
  doc.css("div h4").each do |heading|
    container = heading.parent
    label = heading.inner_text
    name = label.sub(':', '').strip.downcase

    # Everything in the parent's text after the first occurrence of the heading.
    label_start = container.inner_text =~ /#{Regexp.escape(label)}/
    remainder = container.inner_text[(label_start + label.length).. -1]

    without_marker = remainder.gsub(/\302\240\302\273/u, '').strip
    # NOTE(review): the trailing `$` only applies to the last alternative.
    text = without_marker.gsub(/(See more)|(see all)|(See all certifications)|(See full summary)$/, '').strip

    yield name.downcase.gsub(/[^a-zA-Z0-9 ]/, '').gsub(/\s/, '_').to_sym, text
  end
end
.scrap_episodes(info_hash) ⇒ Object
166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
# File 'lib/yay_imdbs.rb', line 166

# Scrapes the episode list of a series and stores it under 'episodes'.
#
# Each '.filter-all' element whose <h3> reads "Season N, Episode M: Title"
# becomes a hash with the keys "series", "episode", "title", 'date' and,
# when the date text can be located in the element, 'plot' (the text that
# follows the date). Elements without such a heading are skipped.
#
# @param info_hash [Hash] must provide :imdb_id; receives 'episodes'
# @return [Array<Hash>] the episode list that was stored
def scrap_episodes(info_hash)
  doc = get_episodes_page(info_hash[:imdb_id])

  episodes = []
  doc.css(".filter-all").each do |e_div|
    heading = e_div.at_css('h3')
    header = heading && /Season (\d+), Episode (\d+):/.match(heading.inner_text)
    next unless header

    episode = {"series" => header[1].to_i, "episode" => header[2].to_i, "title" => header.post_match.strip}

    # An element without a <strong> simply has no date (and hence no plot).
    date_node = e_div.at_css('strong')
    raw_date = date_node && date_node.inner_text.strip

    episode['date'] = begin
      raw_date && Date.parse(raw_date)
    rescue StandardError
      # Unparseable dates are deliberately reported as "unknown".
      nil
    end

    if raw_date
      # The date is literal text, so it must be escaped before being used as
      # a pattern ('.', '(' etc. would otherwise act as regex operators).
      date_match = /#{Regexp.escape(raw_date)}/.match(e_div.inner_text)
      episode['plot'] = date_match.post_match.strip if date_match
    end

    episodes << episode
  end

  info_hash['episodes'] = episodes
end
.scrap_images(doc, info_hash) ⇒ Object
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
# File 'lib/yay_imdbs.rb', line 149

# Collects poster image urls from a title page into info_hash.
#
# Sets 'medium_image' and 'small_image' from the primary poster thumbnail
# and, when the poster links to a viewer page, 'large_image' from that page.
# Does nothing when there is no poster or the poster is the "nopicture"
# placeholder.
#
# @param doc [Object] parsed title page responding to `at_css`
# @param info_hash [Hash] receives the image urls
# @return [String, nil] the large image url when it was looked up, otherwise nil
def scrap_images(doc, info_hash)
  # Poster thumbnail shown on the title page.
  poster_src = doc.at_css("td[id=img_primary] a img").try(:[], 'src')
  return if poster_src.nil? || poster_src =~ /\/nopicture\//

  info_hash['medium_image'] = poster_src
  # The small thumbnail url is derived by rewriting the tail of the medium url.
  info_hash['small_image'] = poster_src.sub(/@@.*$/, '@@._V1._SX120_120,160_.jpg')

  # The poster links to a viewer page that carries a larger rendition.
  viewer_href = doc.at_css("td[id=img_primary] a").try(:[], 'href')
  if viewer_href
    viewer_doc = get_media_page(viewer_href)
    info_hash['large_image'] = viewer_doc.at_css("img[id=primary-img]").try(:[], 'src')
  end
end
.scrap_movie_info(imdb_id) ⇒ Object
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
# File 'lib/yay_imdbs.rb', line 79

# Scrapes the title page of one IMDb entry into a hash.
#
# The hash contains :imdb_id, :title, :year, :video_type, :plot, :rating,
# and for every info heading found on the page both the raw text
# ("raw_<key>") and the cleaned value (<key>, plus its PROPERTY_ALIAS name
# when one exists). Image urls are added by scrap_images, and episodes by
# scrap_episodes when the page had a 'season' property.
#
# @param imdb_id [String] id as used in the title url (without the 'tt' prefix)
# @return [Hash] the collected information (built with with_indifferent_access)
# @raise [RuntimeError] when no title can be read from the page, or when the
#   page has no info headings at all
def scrap_movie_info(imdb_id)
  info_hash = {:imdb_id => imdb_id}.with_indifferent_access

  doc = get_movie_page(imdb_id)
  title, year = get_title_and_year_from_meta(doc)
  info_hash[:title], info_hash[:year] = title, year
  if info_hash['title'].nil?
    #If we cant get title and year something is wrong
    raise "Unable to find title or year for imdb id #{imdb_id}"
  end
  info_hash[:video_type] = video_type_from_meta(doc)

  info_hash[:plot] = doc.xpath("//td[@id='overview-top']/p[2]").inner_text.strip
  # A missing or malformed rating element leaves the rating as nil.
  info_hash[:rating] = doc.at_css('.rating-rating').content.gsub(/\/.*/, '').to_f rescue nil

  found_info_divs = false
  movie_properties(doc) do |key, value|
    found_info_divs = true
    info_hash["raw_#{key}"] = value
    info_hash[key] = clean_movie_property(key, value)
    info_hash[PROPERTY_ALIAS[key]] = info_hash[key] if PROPERTY_ALIAS[key]
  end

  unless found_info_divs
    #If we don't find any info divs assume parsing failed
    raise "No info divs found for imdb id #{imdb_id}"
  end

  # Hack: tv shows can have a year property, which is a list, fixing ...
  info_hash[:year] = year

  scrap_images(doc, info_hash)

  #scrap episodes if tv series
  scrap_episodes(info_hash) if info_hash.has_key?('season')

  return info_hash
end
.search_for_imdb_id(name, year = nil, type = nil) ⇒ Object
33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/yay_imdbs.rb', line 33

# Searches by name and returns the id of the first acceptable result.
#
# A result is acceptable when its video type matches `type` (compared as
# strings; any type is accepted when `type` is not given) and its year
# equals `year` (any year is accepted when `year` is nil).
#
# @param name [String] search term
# @param year [Integer, nil] required year, or nil for "any"
# @param type [Symbol, String, nil] required video type, or nil for "any"
# @return [String, nil] the :imdb_id of the first acceptable result, or nil
def search_for_imdb_id(name, year=nil, type=nil)
  chosen = search_imdb(name).find do |candidate|
    # Ensure the candidate is the correct video type
    type_ok = !type || candidate[:video_type].to_s == type.to_s
    # Without a year the first candidate of the right type wins
    type_ok && (year.nil? || candidate[:year] == year)
  end

  chosen && chosen[:imdb_id]
end
.search_imdb(search_term) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/yay_imdbs.rb', line 46

# Runs a title search and returns the results as a list of hashes with the
# keys :imdb_id, :name, :year and :video_type.
#
# @param search_term [String] free-text search term
# @return [Array<Hash>] one entry for an exact match, otherwise one entry per
#   title link found in the result table cells
# @raise [RuntimeError] when the page looks like an exact match but its
#   canonical link carries no id
def search_imdb(search_term)
  search_results = []

  doc = get_search_page(search_term)
  # If the search is an exact match imdb will redirect to the movie page not search results page
  # we uses the title meta element to determine if we got an exact match
  movie_title, movie_year = get_title_and_year_from_meta(doc)
  if movie_title
    canonical_link = doc.at_css("link[rel='canonical']").try(:[], 'href')
    if canonical_link && canonical_link =~ /tt(\d+)\//
      return [:name => movie_title, :year => movie_year, :imdb_id => $1, :video_type => video_type_from_meta(doc)]
    else
      raise "Unable to extract imdb id from exact search result"
    end
  end

  doc.css("td").each do |td|
    td.css("a").each do |link|
      href = link['href']
      current_name = link.content

      # Ignore links with no text (e.g. image links) or links that don't link to movie pages
      next unless current_name.present? && href =~ /^\/title\/tt(\d+)/
      imdb_id = $1

      # The year is taken from the first "(2010)" / "(2010/I)" in the cell;
      # to_i keeps only the leading digits. It stays nil when the cell has none.
      current_year = $1.to_i if td.inner_text =~ /\((\d{4}\/?\w*)\)/
      search_results << {:imdb_id => imdb_id, :name => clean_title(current_name), :year => current_year, :video_type => video_type(td)}
    end
  end

  return search_results
end
.strip_whitespace(s) ⇒ Object
Hack to work around a Ruby 1.9 encoding issue.
218 219 220 |
# File 'lib/yay_imdbs.rb', line 218

# Hack to work around a Ruby 1.9 encoding issue: the string is re-encoded
# as UTF-8 before everything matched by STRIP_WHITESPACE is removed.
#
# @param s [String] text to clean
# @return [String] the text without STRIP_WHITESPACE matches and without
#   leading or trailing whitespace
def strip_whitespace(s)
  utf8_text = s.encode('UTF-8')
  cleaned = utf8_text.gsub(STRIP_WHITESPACE, '')
  cleaned.strip
end
.video_type(td) ⇒ Object
222 223 224 225 |
# File 'lib/yay_imdbs.rb', line 222

# Classifies a search result cell by the marker in its text.
#
# @param td [Object] table cell responding to `content`
# @return [Symbol] :tv_show when the text contains "(TV series)" or "(TV)",
#   otherwise :movie
def video_type(td)
  td.content =~ /\((TV series|TV)\)/ ? :tv_show : :movie
end
.video_type_from_meta(doc) ⇒ Object
227 228 229 230 |
# File 'lib/yay_imdbs.rb', line 227

# Classifies a title page by its <meta property="og:type"> element.
#
# @param doc [Object] parsed page responding to `at_css`
# @return [Symbol] :tv_show when the meta content is exactly 'tv_show',
#   otherwise (including when the element is missing) :movie
def video_type_from_meta(doc)
  type_text = doc.at_css("meta[property='og:type']").try(:[], 'content')
  type_text == 'tv_show' ? :tv_show : :movie
end