Class: GruCrawler

Inherits:
Object
  • Object
show all
Defined in:
lib/grucrawler.rb,
lib/grucrawler/queue.rb,
lib/grucrawler/version.rb

Defined Under Namespace

Classes: DoNotCrawlFurther, Queue

Constant Summary collapse

VERSION =
"0.0.5"

Instance Method Summary collapse

Constructor Details

#initialize(rules) ⇒ GruCrawler

Returns a new instance of GruCrawler.



12
13
14
15
16
17
18
19
# File 'lib/grucrawler.rb', line 12

# Wires the user-supplied rules object into the crawler: stores it,
# reads its options hash, builds the per-crawler URL queue (keyed by the
# rules class name), and finally fires the rules' +on_init+ hook.
#
# @param rules [Object] user object providing options(), on_init(), etc.
def initialize(rules)
  @crawler = rules
  @options = @crawler.options
  @queue = GruCrawler::Queue.new(
    @crawler.class.name,
    @options[:visit_urls_only_once],
    @options[:domain_wait] || 20  # seconds between hits on one domain
  )
  @crawler.on_init(self)
end

Instance Method Details

#add_from_queueObject



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/grucrawler.rb', line 32

# Pops the next URL from the queue and hands it to Hydra as an async
# Typhoeus request whose completion is routed to +on_response+.
#
# @return [Boolean] true if a request was queued, false when the queue is empty
def add_from_queue
  url = @queue.next_url
  return false unless url

  @queue.started(url)
  request = Typhoeus::Request.new(
    url,
    followlocation: @options[:follow_redirects],
    accept_encoding: 'gzip'
  )
  request.on_complete { |response| on_response(response) }

  @crawler.debug("#{Time.now} started URL #{url}")
  @hydra.queue(request)

  true
end

#add_url(url) ⇒ Object



28
29
30
# File 'lib/grucrawler.rb', line 28

# Enqueues +url+ for crawling by delegating straight to the queue.
# The queue's +push+ return value is propagated to the caller
# (queue_links uses it to decide whether the URL was actually added).
def add_url(url)
  @queue.push(url)
end

#crawl_moreObject



77
78
79
80
81
# File 'lib/grucrawler.rb', line 77

# Keeps the request pipeline topped up: enqueues requests until
# @concurrency URLs are in flight or the queue runs dry
# (add_from_queue returning false).
def crawl_more
  until @queue.count >= @concurrency
    break unless add_from_queue
  end
end

#on_response(response) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/grucrawler.rb', line 53

# Typhoeus completion callback: records the finished fetch, keeps the
# crawl pipeline saturated, parses the page, notifies the rules object,
# and queues any newly discovered links.
#
# @param response [Typhoeus::Response] completed response (success or not)
def on_response(response)
  @crawler.debug("#{Time.now} ended URL #{response.request.url}")
  @queue.finished(response.request.url)

  # Refill the pipeline before doing any (slow) parsing work.
  crawl_more

  # Skip oversized pages; default cap is 1 GB.
  max_page_size = @options[:max_page_size] || 1_000_000_000
  if response.body.length > max_page_size
    @crawler.debug("URL response size too big: #{response.body.length} from #{response.request.url}")
    return
  end

  nokogiri = Nokogiri::HTML(response.body)

  begin
    @crawler.on_page_received(response, nokogiri)
  rescue StandardError => e
    # Isolate failures in user callback code so one bad page
    # cannot abort the whole crawl.
    @crawler.log_error(response, e)
  end

  queue_links(response, nokogiri)

  crawl_more
end

#queue_links(response, nokogiri) ⇒ Object
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/grucrawler.rb', line 83

def queue_links(response, nokogiri)
  nokogiri.css('a').each do |link|
    next unless link['href']

    begin
      url = URI.join(response.effective_url, link['href']).to_s
    rescue
      next
    end
    if @crawler.follow_link(url, response, nokogiri)
      added = add_url(url)
      @crawler.debug("#{Time.now} queued #{url}") if added
    end
  end
end

#resetObject



49
50
51
# File 'lib/grucrawler.rb', line 49

# Clears crawl state by delegating to the underlying queue's reset,
# so the crawler can be run again from scratch.
def reset
  @queue.reset
end

#runObject



21
22
23
24
25
26
# File 'lib/grucrawler.rb', line 21

# Entry point for a crawl: builds the Typhoeus::Hydra run loop, seeds it
# with up to :concurrency requests (default 5), and blocks in Hydra
# until every queued request has completed.
def run
  @hydra = Typhoeus::Hydra.new
  @concurrency = @options[:concurrency] || 5
  crawl_more
  @hydra.run
end