Class: Scrapix::GoogleImages

Inherits:
Object
  • Object
show all
Includes:
Capybara::DSL
Defined in:
lib/scrapix/google_images.rb

Overview

Downloads images from a Google Image Search.

Instance Method Summary collapse

Constructor Details

#initialize(query = nil, options = {}) ⇒ GoogleImages

options can be:

size: a named size, e.g. icon, small, medium, large, 13mp, 1280x800, etc. (default: any)
safe: true or false (default: true)


10
11
12
13
14
# File 'lib/scrapix/google_images.rb', line 10

# Builds a searcher for the given query.
#
# Each argument is routed through the matching writer (options=, query=,
# total=) rather than assigned to an instance variable directly, and the
# result cap is initialised to 100.
#
# @param query [String, nil] search terms; nil leaves the query unset
# @param options [Hash] search options (e.g. size:, safe:)
def initialize(query = nil, options = {})
  self.options = options
  self.query = query
  self.total = 100
end

Instance Method Details

#find(page_no = 1) ⇒ Object

Params: page_no — the starting page number for Google results (defaults to 1).



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/scrapix/google_images.rb', line 43

def find(page_no = 1)
  images = {}
  return images unless @query

  while images.count < @num
    visit search_url(page_no)
    links = Capybara.page.all("a")
    links = links.select{|x| x["href"] =~ /^\/imgres/} if links.any?
    return images unless links.any?
    page_counter = 0
    links.each do |link|
      attribs = CGI.parse(URI.parse(link["href"]).query) rescue nil
      next if attribs.nil?
      hash = Digest::MD5.hexdigest(attribs["imgurl"][0])
      unless images.has_key?(hash)
        images[hash] = {
          width:          attribs["w"][0],
          height:         attribs["h"][0],
          url:            attribs["imgurl"][0],
          reference_url:  attribs["imgrefurl"][0]
        }
        page_counter += 1
      end
    end
    page_no += 1
    break if page_counter == 0
  end
  images.take(@num).map{|x| x[1]}
end

#options=(opts) ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/scrapix/google_images.rb', line 28

# Merges +opts+ into the stored search options and refreshes @params.
#
# Keys are converted to strings first, so symbol and string keys address the
# same entry. On first use the stored options start from
# "safe" => true and "size" => "any".
#
# @param opts [Hash] option overrides, e.g. size: "large", safe: false
def options=(opts)
  stringified = opts.each_with_object({}) do |(key, value), acc|
    acc[key.to_s] = value
  end

  # seed the defaults once, then layer the caller's values on top
  @options = { "safe" => true, "size" => "any" } unless @options
  @options.merge!(stringified)
  sanitize_size

  # keep the URL form of the options in sync with @options
  @params = create_params
end

#query=(q) ⇒ Object



20
21
22
# File 'lib/scrapix/google_images.rb', line 20

# Stores the search terms, percent-encoded for use as the value of the
# "q" URL parameter.
#
# Form-component encoding is used so that characters with a meaning inside a
# query string ("&", "=", "+", "#") cannot split or truncate the parameter;
# spaces are encoded as "+". URI.escape, used previously, left "&", "=" and
# "+" untouched and no longer exists in Ruby 3.0 and later.
#
# @param q [String, nil] raw search terms; nil leaves any existing query as is
def query=(q)
  @query = URI.encode_www_form_component(q) if q
end

#search_url(page_no = 1) ⇒ Object



16
17
18
# File 'lib/scrapix/google_images.rb', line 16

# Builds the Google image-search URL for one page of results.
#
# @param page_no [Integer] 1-based page number; each page advances the
#   "start" offset by 20
# @return [String] URL made of the stored @query, the stored @params and the
#   computed start offset
def search_url(page_no = 1)
  offset = (page_no - 1) * 20
  "http://google.com/search?tbm=isch&q=#{@query}#{@params}&start=#{offset}"
end

#total=(n) ⇒ Object



24
25
26
# File 'lib/scrapix/google_images.rb', line 24

# Sets the maximum number of images to collect.
#
# @param n [#to_i] desired count; converted with to_i before being stored
def total=(n)
  count = n.to_i
  @num = count
end