Class: GoogleCrawler

Inherits:
Object
  • Object
show all
Defined in:
lib/gcrawler/search.rb

Overview

Google crawler

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(proxies: [], black_domains: [], exclude_hosts: []) ⇒ GoogleCrawler

Returns a new instance of GoogleCrawler.



41
42
43
44
45
46
47
# File 'lib/gcrawler/search.rb', line 41

def initialize(proxies: [], black_domains: [], exclude_hosts: [])
  # Backing crawler that performs the actual HTTP fetches.
  @crawler = Crawler.new

  # Hosts whose results should be dropped when filtering.
  @exclude_hosts = exclude_hosts

  # Proxy pool and domain blacklist are shared process-wide via Utils.
  Utils.proxies = proxies
  Utils.black_domains = black_domains
end

Instance Attribute Details

#exclude_hostsObject

Returns the value of attribute exclude_hosts.



39
40
41
# File 'lib/gcrawler/search.rb', line 39

# Reader for the list of hosts excluded from search results.
#
# @return [Array] the hosts passed to #initialize via +exclude_hosts:+
def exclude_hosts
  @exclude_hosts
end

Instance Method Details

#search_as_object(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0) ⇒ Object

Search and return the results as objects with the key 'url'



57
58
59
60
61
# File 'lib/gcrawler/search.rb', line 57

# Run a search and return the results materialized as objects
# (delegates crawling to #search_as_page, then builds objects).
def search_as_object(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0)
  options = { language: language, num: num, country: country, start: start, pause: pause }
  search_as_page(*keywords, **options)

  generate_objects
end

#search_as_page(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0) ⇒ Object

search as page Args:

keywords (varargs): kw1, kw2, kw3, ...
language (str, optional): Query language. Defaults to nil.
num (uint, optional): Number of results per page (default is 10 per page). Defaults to nil.
start (int, optional): Offset. Defaults to 0.
country (str, optional): Query country. Defaults to nil, example: countryCN or cn or CN.
pause (uint, optional): Crawling delay in seconds between two requests.
                        Too short a delay may be blocked by Google's crawling monitor.
                        Defaults to 0, which uses a random interval.

Return:

Mechanize::Page, see https://github.com/sparklemotion/mechanize


76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/gcrawler/search.rb', line 76

# Crawl one Google results page for the given keywords.
#
# @param keywords [Array<String>] query terms, joined with '+'
# @param language [String, nil] query language (appended as &ln=)
# @param num [Integer, nil] results per page (appended as &num=)
# @param country [String, nil] query country (appended as &cr=)
# @param start [Integer] result offset, 0-based
# @param pause [Integer, nil] delay in seconds before the request;
#   nil or 0 picks a random interval via Utils.random_interval_time
# @raise [RuntimeError] when Google answers with a non-200 status
def search_as_page(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0)
  return if keywords.empty?

  # Interpolate `start` directly instead of writing "start=0" and then
  # patching it with gsub! — equivalent output, one fewer failure mode.
  query_str = "q=#{keywords.join('+')}&btnG=Search&gbv=1&safe=active&start=#{start}"
  query_str += "&ln=#{language}" unless language.blank?
  query_str += "&num=#{num.to_i}" unless num.blank?
  query_str += "&cr=#{country}" unless country.blank?

  @crawler.query_str(query_str)

  # Guard against nil: callers may pass pause: nil (documented default),
  # and nil.zero? would raise NoMethodError. nil/0 both mean "random delay",
  # so two back-to-back requests are never fired instantly.
  seconds = pause.nil? || pause.zero? ? Utils.random_interval_time : pause
  LOGGER.info "Crawling query string is #{query_str}, will be crawling after #{seconds} seconds..."
  sleep(seconds)

  @crawler.crawl

  raise "Fetch on Google failed with code #{@crawler.response_code}" unless @crawler.response_code == 200

  LOGGER.info 'Crawl on Google successfully...'
end

#search_as_url(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0) ⇒ Object

search as url



50
51
52
53
54
# File 'lib/gcrawler/search.rb', line 50

# Run a search and return the filtered result URLs
# (delegates crawling to #search_as_page, then filters).
def search_as_url(*keywords, language: nil, num: nil, country: nil, start: 0, pause: 0)
  options = { language: language, num: num, country: country, start: start, pause: pause }
  search_as_page(*keywords, **options)

  filter_urls
end