Class: Youtube::BrowseScraper

Inherits:
Object
  • Object
show all
Defined in:
lib/youtube/browsescraper.rb

Overview

Introduction

Youtube::BrowseScraper scrapes video information from the browse result pages (Most Recent, Most Viewed, and so on) of www.youtube.com.

You can get the result as an array or as XML.

The XML format is the same as that of the YouTube Developer API (www.youtube.com/dev_api_ref?m=youtube.videos.list_by_tag).

Example

require "rubygems"
require "youtube/browsescraper"

scraper = Youtube::BrowseScraper.new(browse, time, category, language, page)
scraper.open
data = scraper.scrape
p data

More Information

Author

Syuichi Kohata <[email protected]>

Version

0.0.1

License

MIT license

Constant Summary collapse

MostRecent =

Constants for the browse parameter (default: MostRecent)

'mr'
MostViewed =
'mp'
TopRated =
'tr'
MostDiscussed =
'md'
TopFavorites =
'mf'
MostLinked =
'mrd'
RecentryFeatured =
'rf'
MostResponded =
'ms'
WatchOnMobile =
'mv'
BrowseArray =
[MostRecent,
MostViewed,
TopRated,
MostDiscussed,
TopFavorites,
MostLinked,
RecentryFeatured,
MostResponded,
WatchOnMobile]
Today =

constants for time parameter(default Today)

't'
ThisWeek =
'w'
ThisMonth =
'm'
All =
'a'
TimeArray =
[Today,
ThisWeek,
ThisMonth,
All]
AllCategory =

constants for category parameter(default 0)

0
AutosVehicles =
2
Comedy =
23
Entertainment =
24
FilmAnimation =
1
GadgetsGames =
20
HowtoDIY =
26
Music =
10
NewsPolitics =
25
PeopleBlogs =
22
PetsAnimals =
15
Sports =
17
TravelPlaces =
19
AllLanguage =

Constants for the language parameter (default: '', meaning all languages)

''
English =
'EN'
Spanish =
'ES'
Japanese =
'JP'
German =
'DE'
Chinese =
'CN'
French =
'FR'
LanguageArray =
[AllLanguage,
English,
Spanish,
Japanese,
German,
Chinese,
French]
@@youtube_search_base_url =
'http://www.youtube.com/browse'

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(browse = MostRecent, time = Today, category = AllCategory, language = AllLanguage, page = 1) ⇒ BrowseScraper

Creates a Youtube::BrowseScraper object. All parameters are optional and fall back to the defaults shown in the signature.

The number of videos per page cannot be specified; it is always 20.



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/youtube/browsescraper.rb', line 136

# Builds a scraper for one YouTube browse page.
#
# browse, time and language must each be one of the values listed in
# BrowseArray, TimeArray and LanguageArray respectively; category and page
# are stored without validation.
#
# Raises RuntimeError naming every invalid parameter.
def initialize browse = MostRecent, time = Today, category = AllCategory, language = AllLanguage, page = 1
  @browse, @time, @category, @language, @page = browse, time, category, language, page

  invalid = []
  invalid << "browse"   unless BrowseArray.include?(@browse)
  invalid << "time"     unless TimeArray.include?(@time)
  invalid << "language" unless LanguageArray.include?(@language)
  return if invalid.empty?

  raise "parameter error occurred.\n" + invalid.map { |name| name + " is invalid.\n" }.join
end

Instance Attribute Details

#browseObject

Returns the value of attribute browse.



120
121
122
# File 'lib/youtube/browsescraper.rb', line 120

# Reader for @browse, the browse-type code stored by the constructor.
def browse; @browse; end

#categoryObject

Returns the value of attribute category.



122
123
124
# File 'lib/youtube/browsescraper.rb', line 122

# Reader for @category, the category code stored by the constructor.
def category; @category; end

#languageObject

Returns the value of attribute language.



123
124
125
# File 'lib/youtube/browsescraper.rb', line 123

# Reader for @language, the language code stored by the constructor.
def language; @language; end

#pageObject

Returns the value of attribute page.



124
125
126
# File 'lib/youtube/browsescraper.rb', line 124

# Reader for @page, the result page number stored by the constructor.
def page; @page; end

#timeObject

Returns the value of attribute time.



121
122
123
# File 'lib/youtube/browsescraper.rb', line 121

# Reader for @time, the time-range code stored by the constructor.
def time; @time; end

#video_countObject (readonly)

Returns the value of attribute video_count.



125
126
127
# File 'lib/youtube/browsescraper.rb', line 125

# Reader for @video_count, the number of videos collected by the most
# recent call to scrape (nil until scrape has been called).
def video_count; @video_count; end

#video_fromObject (readonly)

Returns the value of attribute video_from.



126
127
128
# File 'lib/youtube/browsescraper.rb', line 126

# Reader for @video_from.
# NOTE(review): nothing in the visible source assigns @video_from, so this
# appears to always return nil - confirm.
def video_from; @video_from; end

#video_toObject (readonly)

Returns the value of attribute video_to.



127
128
129
# File 'lib/youtube/browsescraper.rb', line 127

# Reader for @video_to.
# NOTE(review): nothing in the visible source assigns @video_to, so this
# appears to always return nil - confirm.
def video_to; @video_to; end

Instance Method Details

#check_video(video) ⇒ Object



235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# File 'lib/youtube/browsescraper.rb', line 235

# Verifies that the mandatory fields of a scraped video were filled in.
#
# video - object responding to id, author, title, length_seconds and
#         thumbnail_url.
#
# A field counts as missing when it is nil or empty (length_seconds is
# compared by its string form).
#
# Returns nil when every field is present.
# Raises RuntimeError listing each missing field.
def check_video video
  # nil is treated as "missing" so that an unset field is reported in the
  # error message instead of aborting with NoMethodError on nil.empty?.
  absent = lambda { |value| value.nil? || value.empty? }

  missing = []
  missing << "id"             if absent.call(video.id)
  missing << "author"         if absent.call(video.author)
  missing << "title"          if absent.call(video.title)
  missing << "length_seconds" if video.length_seconds.to_s.empty?
  missing << "thumbnail_url"  if absent.call(video.thumbnail_url)
  return if missing.empty?

  raise "scraping error occurred.\n" + missing.map { |field| field + " is not setted.\n" }.join
end

#eachObject



253
254
255
256
257
# File 'lib/youtube/browsescraper.rb', line 253

# Yields every video held in @videos (filled by scrape), in order, to the
# given block. Returns the @videos array itself.
def each
  @videos.each { |entry| yield entry }
end

#get_xmlObject

Return video information in XML format. Note: the method body shown below is empty, so it currently returns nil.



192
193
# File 'lib/youtube/browsescraper.rb', line 192

# Placeholder for returning the scraped videos as XML.
# NOTE(review): there is no implementation; the method always returns nil.
def get_xml
  nil
end

#openObject

Fetch the browse result page from YouTube for the browse, time, category, language and page values given to the constructor, and parse it.



157
158
159
160
161
162
163
164
165
166
# File 'lib/youtube/browsescraper.rb', line 157

# Fetches the browse page described by the constructor parameters and
# parses it.
#
# The request URL is @@youtube_search_base_url plus the query parameters
# s (browse), t (time), c (category), l (language) and p (page). The URL is
# kept in @url, the response body in @html and the parsed Hpricot document
# in @search_result.
#
# Returns the parsed document.
def open
  @url = "#{@@youtube_search_base_url}?s=#{@browse}&t=#{@time}&c=#{@category}&l=#{@language}&p=#{@page}"
  # URI#read (added to URI objects by open-uri) downloads the body without
  # going through Kernel#open, whose URL support was removed in Ruby 3.0,
  # and it closes the underlying IO itself.
  @html = URI.parse(@url).read
  @search_result = Hpricot.parse(@html)
end

#replace_document_write_javascriptObject



195
196
197
# File 'lib/youtube/browsescraper.rb', line 195

# Replaces, inside @html, every inline <script> block that calls
# document.write('...') with the text passed to document.write.
# @html is modified in place.
#
# Returns @html when at least one replacement was made, nil otherwise
# (the String#gsub! convention).
def replace_document_write_javascript
  script_pattern = %r{<script language="javascript" type="text/javascript">.*?document.write\('(.*?)'\).*?</script>}m
  @html.gsub!(script_pattern) { $1 }
end

#scrapeObject

Scrape video information from search result html.



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'lib/youtube/browsescraper.rb', line 169

# Builds one Youtube::Video for every div.v120vEntry element of the document
# held in @search_result (set by open).
#
# @videos and @video_count are reset first. A video is appended only after
# check_video has accepted it, so an exception raised for one entry leaves
# the videos gathered before it in place.
#
# Returns the @videos array.
def scrape
  @videos      = []
  @video_count = 0

  entries = @search_result.search('//div[@class="v120vEntry"]')
  entries.each do |entry|
    item = Youtube::Video.new
    item.id             = scrape_id(entry)
    item.author         = scrape_author(entry)
    item.title          = scrape_title(entry)
    item.length_seconds = scrape_length_seconds(entry)
    item.rating_avg     = scrape_rating_avg(entry)
    item.view_count     = scrape_view_count(entry)
    item.thumbnail_url  = scrape_thumbnail_url(entry)

    check_video item

    @videos << item
    @video_count = @videos.size
  end

  @videos
end

#scrape_author(video_html) ⇒ Object



226
227
228
# File 'lib/youtube/browsescraper.rb', line 226

# Extracts the uploader name from the entry's div.vfacets markup: the text
# of the link that follows the "From:" label. When the pattern does not
# match, the inner HTML is returned unchanged.
def scrape_author video_html
  facets = video_html.search("div[@class='vfacets']").inner_html
  facets.sub(/.*From:<\/span> <a.*?>(.*?)<\/a>.*/m) { $1 }
end

#scrape_id(video_html) ⇒ Object



199
200
201
# File 'lib/youtube/browsescraper.rb', line 199

# Derives the video id from the entry's thumbnail URL: the path segment
# directly in front of the .jpg file name.
#
# Returns the id, or an empty string when the thumbnail URL does not have
# that shape.
def scrape_id video_html
  thumbnail_url = scrape_thumbnail_url(video_html)
  # The dot before "jpg" is escaped so only a real ".jpg" suffix matches, and
  # only the captured segment is returned: a URL that does not match yields
  # "" (which check_video reports as a missing id) rather than the whole URL
  # posing as an id.
  thumbnail_url[%r{.*/([^/]+)/[^/]+\.jpg}, 1].to_s
end

#scrape_length_seconds(video_html) ⇒ Object



211
212
213
214
215
# File 'lib/youtube/browsescraper.rb', line 211

# Reads the running time shown in the entry's span.runtime element and
# converts it to seconds.
#
# Understands "m:ss", "mm:ss", minute counts above 99 and "h:mm:ss".
# Returns 0 when no running time is found.
def scrape_length_seconds video_html
  runtime = video_html.search("span[@class='runtime']").inner_html
  # Take the whole colon-separated clock value. A fixed two-digit pattern
  # would miss single-digit minutes ("5:07"), drop leading digits ("123:45")
  # and ignore an hours part ("1:02:03").
  clock = runtime[/\d+(?::\d\d)+/]
  return 0 unless clock

  # Fold left to right: every further component is worth 1/60 of the
  # previous one.
  clock.split(':').inject(0) { |seconds, part| seconds * 60 + part.to_i }
end

#scrape_rating_avg(video_html) ⇒ Object



217
218
219
220
# File 'lib/youtube/browsescraper.rb', line 217

# Computes the star rating of an entry by counting its star images: each
# full-star image counts 1 and each half-star image counts 0.5.
def scrape_rating_avg video_html
  full_stars = video_html.search("img[@src='/img/icn_star_full_11x11.gif']").size
  half_stars = video_html.search("img[@src='/img/icn_star_half_11x11.gif']").size
  full_stars + half_stars * 0.5
end

#scrape_thumbnail_url(video_html) ⇒ Object



203
204
205
# File 'lib/youtube/browsescraper.rb', line 203

# Pulls the src attribute value out of the entry's img.vimg120 markup.
# When the markup contains no src="..." it is returned unchanged.
def scrape_thumbnail_url video_html
  image_markup = video_html.search("img[@class='vimg120']").to_html
  image_markup.sub(/.*src="(.*?)".*/) { $1 }
end

#scrape_title(video_html) ⇒ Object



207
208
209
# File 'lib/youtube/browsescraper.rb', line 207

# Returns the inner HTML of the link inside the entry's div.vtitle element.
def scrape_title video_html
  title_link = video_html.search('div[@class="vtitle"]/a')
  title_link.inner_html
end

#scrape_view_count(video_html) ⇒ Object



230
231
232
233
# File 'lib/youtube/browsescraper.rb', line 230

# Reads the number that follows the "Views:" label in the entry's
# div.vfacets markup.
#
# Returns the count as an Integer with the thousands separators removed, or
# 0 when no view count is present.
def scrape_view_count video_html
  facets = video_html.search("div[@class='vfacets']").inner_html
  # Only the captured digits are converted, and they live in a local:
  # unrelated markup cannot be mistaken for a number when the label is
  # missing, and no scratch instance variable is left on the scraper.
  digits = facets[/.*Views:<\/span> ([\d,]+)/m, 1]
  digits.to_s.delete(',').to_i
end