Class: Scrapix::GoogleImages
- Inherits:
-
Object
- Object
- Scrapix::GoogleImages
- Includes:
- Capybara::DSL
- Defined in:
- lib/scrapix/google_images.rb
Overview
Downloads images from a Google Image Search.
Instance Method Summary collapse
-
#find(page_no = 1) ⇒ Object
params: page_no => starting page number for google results.
-
#initialize(query = nil, options = {}) ⇒ GoogleImages
constructor
options can be: size: named size, e.g. icon, small, medium, large, 13mp, 1280x800, etc.; safe: true or false.
- #options=(opts) ⇒ Object
- #query=(q) ⇒ Object
- #search_url(page_no = 1) ⇒ Object
- #total=(n) ⇒ Object
Constructor Details
#initialize(query = nil, options = {}) ⇒ GoogleImages
options can be:
size: named size, e.g. icon, small, medium, large, 13mp, 1280x800, etc.
safe: true or false
10 11 12 13 14 |
# File 'lib/scrapix/google_images.rb', line 10
#
# Builds a new image search.
#
# @param query [String, nil] the search phrase, handed to #query=
# @param options [Hash] search options, handed to #options=. Documented keys:
#   size: a named size such as icon, small, medium, large, 13mp, 1280x800;
#   safe: true or false.
def initialize(query = nil, options = {})
  self.options = options
  self.query = query
  # Default cap on the number of images #find collects.
  self.total = 100
end
Instance Method Details
#find(page_no = 1) ⇒ Object
params: page_no => starting page number for google results
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# File 'lib/scrapix/google_images.rb', line 43 def find(page_no = 1) images = {} return images unless @query while images.count < @num visit search_url(page_no) links = Capybara.page.all("a") links = links.select{|x| x["href"] =~ /^\/imgres/} if links.any? return images unless links.any? page_counter = 0 links.each do |link| attribs = CGI.parse(URI.parse(link["href"]).query) rescue nil next if attribs.nil? hash = Digest::MD5.hexdigest(attribs["imgurl"][0]) unless images.has_key?(hash) images[hash] = { width: attribs["w"][0], height: attribs["h"][0], url: attribs["imgurl"][0], reference_url: attribs["imgrefurl"][0] } page_counter += 1 end end page_no += 1 break if page_counter == 0 end images.take(@num).map{|x| x[1]} end |
#options=(opts) ⇒ Object
28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/scrapix/google_images.rb', line 28
#
# Merges the given search options into the current ones and rebuilds the
# URL parameter string.
#
# @param opts [Hash, nil] options keyed by String or Symbol; nil is treated
#   as an empty hash
# @return [Object] the assigned value, as with any Ruby setter
def options=(opts)
  # convert symbolic keys to string keys
  options = (opts || {}).each_with_object({}) { |(k, v), acc| acc[k.to_s] = v }

  # merge the options with defaults!
  @options ||= { "safe" => true, "size" => "any" }
  @options.merge!(options)
  # sanitize_size is defined elsewhere in the class; presumably it normalises
  # @options["size"] -- confirm against its definition.
  sanitize_size

  # parametrize for url purposes
  @params = create_params
end
#query=(q) ⇒ Object
20 21 22 |
# File 'lib/scrapix/google_images.rb', line 20
#
# Stores the search phrase in URL-encoded form, ready to be interpolated into
# the search URL. A nil/false value leaves the current query untouched.
#
# URI.escape was removed in Ruby 3.0 and did not escape reserved characters
# such as "&" or "=", so form-component encoding is used instead (spaces
# become "+").
#
# @param q [String, nil] the raw search phrase
def query=(q)
  @query = URI.encode_www_form_component(q) if q
end
#search_url(page_no = 1) ⇒ Object
16 17 18 |
# File 'lib/scrapix/google_images.rb', line 16
#
# Builds the Google Image Search URL for one result page.
#
# @param page_no [Integer] 1-based page number; values below 1 are treated
#   as the first page so the start offset is never negative
# @return [String] the URL, built from @query, @params and a start offset of
#   20 results per page
def search_url(page_no = 1)
  offset = [page_no - 1, 0].max * 20
  "https://google.com/search?tbm=isch&q=#{@query}#{@params}&start=#{offset}"
end
#total=(n) ⇒ Object
24 25 26 |
# File 'lib/scrapix/google_images.rb', line 24
#
# Sets the maximum number of images #find will collect.
#
# @param n [#to_i] desired total; coerced with to_i and clamped to zero,
#   because a negative count would make Enumerable#take raise in #find
def total=(n)
  @num = [n.to_i, 0].max
end