Class: Imdb

Inherits:
Object
  • Object
show all
Defined in:
lib/imdb/imdb.rb

Constant Summary collapse

# Endpoints on imdb.com that the scraper builds its request URLs from.

# Title pages; an IMDb id is appended directly after the trailing slash.
IMDB_MOVIE_BASE_URL = "http://www.imdb.com/title/"
# Person ("name") pages.
IMDB_NAME_BASE_URL = "http://www.imdb.com/name/"
# Company pages.
IMDB_COMPANY_BASE_URL = "http://www.imdb.com/company/"
# Genre section pages.
IMDB_GENRE_BASE_URL = "http://www.imdb.com/Sections/Genres/"
# Site search; the escaped query text is appended after "q=".
IMDB_SEARCH_BASE_URL = "http://imdb.com/find?s=all&q="
# Top 250 chart.
IMDB_TOP_250_URL = "http://www.imdb.com/chart/top"
# Chart pages; a chart name such as "<decade>s" is appended.
IMDB_TOP_BY_DECADE_BASE_URL = "http://www.imdb.com/chart/"
# All-time box-office chart; a "?region=..." query may be appended.
IMDB_ALL_TIME_BOX_OFFICE_BASE_URL = "http://www.imdb.com/boxoffice/alltimegross"

Class Method Summary collapse

Class Method Details

.all_time_us_box_office ⇒ Object



36
37
38
39
40
# File 'lib/imdb/imdb.rb', line 36

# Downloads the all-time box-office chart at IMDB_ALL_TIME_BOX_OFFICE_BASE_URL
# (no region parameter) and returns its rows.
#
# @return [Array<Hash>] whatever parse_all_time_box_office extracts from the page
def self.all_time_us_box_office
  # Block form of open closes the handle as soon as the body has been read.
  html = open(IMDB_ALL_TIME_BOX_OFFICE_BASE_URL) { |page| page.read }
  parse_all_time_box_office(Hpricot(html))
end

.all_time_worldwide_box_office ⇒ Object



42
43
44
45
# File 'lib/imdb/imdb.rb', line 42

# Downloads the all-time box-office chart with "?region=world-wide" appended
# to IMDB_ALL_TIME_BOX_OFFICE_BASE_URL and returns its rows.
#
# @return [Array<Hash>] whatever parse_all_time_box_office extracts from the page
def self.all_time_worldwide_box_office
  # Block form of open closes the handle as soon as the body has been read.
  html = open("#{IMDB_ALL_TIME_BOX_OFFICE_BASE_URL}?region=world-wide") { |page| page.read }
  parse_all_time_box_office(Hpricot(html))
end

.find_movie_by_id(id) ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/imdb/imdb.rb', line 78

# Downloads the title page for +id+ and scrapes it into an ImdbMovie.
#
# Fields are filled from the page's <title>, the star-bar rating, the poster
# image, each "div.info" section (matched on its <h5> heading) and the
# "table.cast" rows. Sections that are absent from the page leave the
# corresponding attribute untouched.
#
# @param id [String] identifier appended to IMDB_MOVIE_BASE_URL
#   (presumably of the form "tt0123456" -- confirm against callers)
# @return [ImdbMovie] the populated movie
def self.find_movie_by_id(id)
  coder = HTMLEntities.new

  # Read the body inside a block so the handle is closed, and hand Hpricot a
  # String like the other fetchers in this class do.
  data = Hpricot(open(IMDB_MOVIE_BASE_URL + id) { |page| page.read })

  movie = ImdbMovie.new

  movie.imdb_id = id
  # Drop a parenthesised year ("(2009)", "(2009/I)") or all-caps tag from the page title.
  movie.title = coder.decode(data.at("title").inner_text.gsub(/\((\d{4}(\/[^)]*)?|[A-Z]+)\)/,'').strip)

  rating_text = (data/"div.starbar-meta/b").inner_text
  if rating_text =~ /([\d\.]+)\/10/
    movie.rating = $1
  end

  # A page without a poster simply yields nil.
  poster = data.at("div.photo/a[@name='poster']/img")
  movie.poster_url = poster && poster['src']

  infos = (data/"div.info")
  infos.each do |info|
    info_title = (info/"h5").inner_text
    case info_title
    when /Directors?:/
      movie.directors = parse_names(info)
    when /Writers?[^:]+:/
      movie.writers = parse_names(info)
    when /Company:/
      movie.company = parse_company(info)
    when "Tagline:"
      # Remove the trailing "see more" link text.
      movie.tagline = coder.decode(parse_info(info).strip).gsub(/\s*see more.*/i, '')
    when "Runtime:"
      # Strip a leading "<label>: " prefix and anything after the first "min".
      movie.runtime = parse_info(info).strip.gsub(/^[^:]+:\s*/, '').gsub(/min .*/, 'min')
    when "Plot:"
      # Remove the various synopsis/summary link texts that trail the plot.
      plot = parse_info(info).strip
      plot = plot.gsub(/\s*\|\s*add synopsis.*/i, '')
      plot = plot.gsub(/\s*\|\s*full synopsis.*/i, '')
      plot = plot.gsub(/\s*\|\s*add summary.*/i, '')
      plot = plot.gsub(/\s*full summary.*/i, '')
      plot = plot.gsub(/more$/i, '')
      movie.plot = coder.decode(plot.strip)
    when "Genre:"
      movie.genres = parse_genres(info)
    when "Release Date:"
      # Best effort: any failure while reading or parsing the date yields nil.
      begin
        if parse_info(info).strip =~ /(\d{1,2}) ([a-zA-Z]+) (\d{4})/
          movie.release_date = Date.parse("#{$2} #{$1}, #{$3}")
        end
      rescue
        movie.release_date = nil
      end
    when "Certification:"
      # First "USA:" entry that is not "Unrated", with the prefix removed.
      usa_ratings = (info/"a").map { |link| link.inner_html }.select { |text| text =~ /^USA:/ && text !~ /Unrated/ }
      usa_rating = usa_ratings.first
      movie.certification = usa_rating && usa_rating.sub(/^USA:/, '').strip
    end
  end

  cast = (data/"table.cast"/"tr")
  cast.each do |cast_member|
    # NOTE(review): rows lacking a td.nm cell appear to still add an entry
    # (nil id, blank name) -- confirm whether that is intended.
    actor_id = (cast_member/"td.nm").inner_html[/name\/([^"]+)\//, 1]
    actor_name = coder.decode((cast_member/"td.nm"/"a").inner_text)
    actor_role = coder.decode((cast_member/"td.char").inner_text)
    # Re-assign rather than only appending: the actors reader is defined
    # elsewhere and may hand back a fresh array each call.
    movie.actors = movie.actors << ImdbName.new(actor_id, actor_name, actor_role)
  end

  movie
end

.parse_all_time_box_office(document) ⇒ Object



47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/imdb/imdb.rb', line 47

# Collects one entry per chart row of an already-parsed box-office page.
#
# Rows are taken from "div#main table tr"; a row is skipped unless it has both
# a link inside a cell and a second cell.
#
# @param document [#search] parsed page (an Hpricot document in the callers shown in this file)
# @return [Array<Hash>] hashes with :imdb_id (the "tt<digits>" part of the link's
#   href, or "" when absent), :title (entity-decoded link text) and
#   :dollar_amount (text of the second cell)
def self.parse_all_time_box_office(document)
  entities = HTMLEntities.new
  rows = document.search("div#main table tr")

  rows.inject([]) do |entries, row|
    link = row.at("td a")
    gross = row.at("td:nth(2)")

    if gross && link
      entries << {
        :imdb_id => link["href"].match(/tt\d+/).to_s,
        :title => entities.decode(link.inner_text),
        :dollar_amount => gross.inner_text
      }
    end

    entries
  end
end

.search_movies_by_title(title) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/imdb/imdb.rb', line 62

# Runs an IMDb search for +title+.
#
# The return shape depends on which page comes back:
# * when the page's <title> is "IMDb Title Search", an Array of
#   {:imdb_id, :title} hashes, one per non-empty "/title/tt..." link,
#   de-duplicated;
# * otherwise a single {:imdb_id, :title} Hash built from the page's
#   <link href="http://www.imdb.com/title/tt..."> and <meta name="title"> tags.
#
# @param title [String] search text; CGI-escaped before being put in the URL
# @return [Array<Hash>, Hash] see above
def self.search_movies_by_title(title)
  coder = HTMLEntities.new
  # Block form of open closes the handle as soon as the body has been read.
  html = open("#{IMDB_SEARCH_BASE_URL}#{CGI::escape(title)};s=tt") { |page| page.read }
  document = Hpricot(html)

  if document.search('title').inner_text == "IMDb Title Search"
    # Search-results page: one entry per title link that has visible text.
    # strip_tags / unescape_html are String extensions defined outside this method.
    links = document.search('a[@href^="/title/tt"]').reject do |element|
      element.innerHTML.strip_tags.empty?
    end
    results = links.map do |element|
      {:imdb_id => element['href'][/tt\d+/], :title => coder.decode(element.innerHTML.strip_tags.unescape_html)}
    end
    results.uniq
  else
    # Any other page is read as a single title page.
    # NOTE(review): raises NoMethodError if the <link>/<meta> tags are missing -- confirm callers expect that.
    canonical = document.search('link[@href^="http://www.imdb.com/title/tt"]').first
    meta_title = document.search('meta[@name="title"]').first
    {:imdb_id => canonical['href'].match(/tt\d+/).to_s, :title => coder.decode(meta_title["content"].gsub(/\(\d\d\d\d\)$/, '').strip)}
  end
end

.top_250 ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
# File 'lib/imdb/imdb.rb', line 12

# Downloads the chart at IMDB_TOP_250_URL and lists every link found under
# "div#main".
#
# @return [Array<Hash>] one {:imdb_id, :title} hash per link; :imdb_id is the
#   "tt<digits>" part of the href ("" when the href has none) and :title is the
#   entity-decoded link text
def self.top_250
  coder = HTMLEntities.new
  # Block form of open closes the handle as soon as the body has been read.
  html = open(IMDB_TOP_250_URL) { |page| page.read }

  Hpricot(html).search("div#main a").map do |link|
    {:imdb_id => link["href"].match(/tt\d+/).to_s, :title => coder.decode(link.inner_text)}
  end
end

.top_by_decade(decade) ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
# File 'lib/imdb/imdb.rb', line 24

# Downloads the chart page "<IMDB_TOP_BY_DECADE_BASE_URL><decade>s" and lists
# every link found in the "div#main table:nth(0)" selection.
#
# @param decade [#to_s] interpolated into the URL before a literal "s"
#   (e.g. 1990 requests ".../chart/1990s")
# @return [Array<Hash>] one {:imdb_id, :title} hash per link; :imdb_id is the
#   "tt<digits>" part of the href ("" when the href has none) and :title is the
#   entity-decoded link text
def self.top_by_decade(decade)
  coder = HTMLEntities.new
  # Block form of open closes the handle as soon as the body has been read.
  html = open("#{IMDB_TOP_BY_DECADE_BASE_URL}#{decade}s") { |page| page.read }

  Hpricot(html).search("div#main table:nth(0) a").map do |link|
    {:imdb_id => link["href"].match(/tt\d+/).to_s, :title => coder.decode(link.inner_text)}
  end
end