Class: Watobo::Crawler::Grabber

Inherits:
Object
  • Object
show all
Defined in:
plugins/crawler/lib/grabber.rb

Instance Method Summary collapse

Constructor Details

#initialize(link_queue, page_queue, opts = {}) ⇒ Grabber

Returns a new instance of Grabber.


74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'plugins/crawler/lib/grabber.rb', line 74

# Sets up a grabber that reads links from one queue and writes fetched
# pages to another.
#
# @param link_queue queue the worker takes link bags from (see #run)
# @param page_queue queue fetched pages are appended to (see #run)
# @param opts [Hash] crawler options; stored as-is and also handed to
#   Crawler::Agent.new
#
# A failure while building the agent is only reported on stdout; the
# instance is still created, with @agent left unset.
def initialize(link_queue, page_queue, opts = {})
  @link_queue, @page_queue, @opts = link_queue, page_queue, opts
  @agent = Crawler::Agent.new(@opts)
rescue => e
  puts e
  puts e.backtrace
end

Instance Method Details

#get_page(linkbag) ⇒ Object


26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'plugins/crawler/lib/grabber.rb', line 26

# Fetches the resource a link bag points to and wraps it in a PageBag.
#
# The request target is linkbag.link, or linkbag.link.uri when the link
# object exposes one. When @opts[:head_request_pattern] is set and the
# extension of the target's path matches it (case-insensitively), the agent's
# #head is used; in every other case, or when #head yields nil, the agent's
# #get is used. After the request the method pauses for @opts[:delay]
# milliseconds if that value is positive.
#
# @param linkbag object responding to #link and #depth; nil or an object
#   without #link yields nil
# @return [PageBag, nil] the page paired with linkbag.depth + 1, or nil when
#   nothing was fetched or any StandardError occurred (the error message is
#   printed; the backtrace only when $DEBUG is set)
def get_page(linkbag)
  return nil if linkbag.nil?
  return nil unless linkbag.respond_to? :link

  uri = linkbag.link
  uri = uri.uri if uri.respond_to? :uri

  page = nil

  # An absent option is treated like an empty pattern (never use HEAD)
  # instead of raising on nil and aborting the whole fetch.
  head_pattern = @opts[:head_request_pattern].to_s
  if !head_pattern.empty? && uri.respond_to?(:path)
    pext = uri.path.to_s.match(/\.[^\.]*$/)
    page = @agent.head uri if pext && pext[0] =~ /\.#{head_pattern}/i
  end

  page = @agent.get uri if page.nil?

  # An absent delay means "no pause"; previously the nil comparison raised
  # after the request and the fetched page was thrown away. The rounding
  # now applies to the duration rather than to sleep's return value.
  delay = @opts[:delay].to_f
  sleep((delay / 1000.0).round(3)) if delay > 0

  return nil if page.nil?

  PageBag.new(page, linkbag.depth + 1)
rescue => bang
  puts bang
  puts bang.backtrace if $DEBUG
  nil
end

#run ⇒ Object


56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'plugins/crawler/lib/grabber.rb', line 56

# Spawns the worker thread and returns it.
#
# The thread loops forever: it takes the next link bag from the link queue,
# ignores it when its depth is greater than @opts[:max_depth], otherwise
# fetches it with #get_page and appends any non-nil result to the page
# queue. A StandardError raised while handling one link is printed with its
# backtrace and the loop moves on to the next link.
def run
  Thread.new(@link_queue, @page_queue) do |links, pages|
    loop do
      begin
        linkbag = links.deq
        unless linkbag.depth > @opts[:max_depth]
          fetched = get_page(linkbag)
          pages << fetched unless fetched.nil?
        end
      rescue => e
        puts e
        puts e.backtrace
      end
    end
  end
end