Class: Watobo::Crawler::Grabber
- Inherits:
-
Object
- Object
- Watobo::Crawler::Grabber
- Defined in:
- plugins/crawler/lib/grabber.rb
Instance Method Summary
- #get_page(linkbag) ⇒ Object
-
#initialize(link_queue, page_queue, opts = {}) ⇒ Grabber
constructor
A new instance of Grabber.
- #run ⇒ Object
Constructor Details
#initialize(link_queue, page_queue, opts = {}) ⇒ Grabber
Returns a new instance of Grabber.
74 75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'plugins/crawler/lib/grabber.rb', line 74

# Creates a grabber bound to a queue of links to fetch and a queue that
# receives the fetched pages.
#
# @param link_queue [Object] queue the worker dequeues link bags from
# @param page_queue [Object] queue fetched pages are pushed onto
# @param opts [Hash] crawler options; also handed to Crawler::Agent.new
#
# If building the agent raises, the error and its backtrace are printed and
# @agent is left unassigned; the constructor itself does not re-raise.
def initialize(link_queue, page_queue, opts = {})
  @link_queue, @page_queue, @opts = link_queue, page_queue, opts
  @agent = Crawler::Agent.new(@opts)
rescue => error
  puts error
  puts error.backtrace
end
Instance Method Details
#get_page(linkbag) ⇒ Object
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'plugins/crawler/lib/grabber.rb', line 26

# Fetches the page a link bag points at and wraps it in a PageBag.
#
# A HEAD request is issued instead of a GET when the file extension of the
# URI path matches @opts[:head_request_pattern] (interpolated into a
# case-insensitive regexp). If no HEAD response is obtained, a GET is sent.
#
# @param linkbag [#link, #depth] the link to fetch; its link may itself
#   respond to #uri, in which case that value is used as the request URI
# @return [PageBag, nil] the fetched page tagged with depth + 1, or nil when
#   the link bag is unusable, no page was returned, or any error was raised
#   (errors are printed, with the backtrace only when $DEBUG is set)
def get_page(linkbag)
  return nil if linkbag.nil?
  return nil unless linkbag.respond_to? :link

  link = linkbag.link
  uri = link.respond_to?(:uri) ? link.uri : link

  page = nil
  # A missing option is treated like an empty pattern (no HEAD requests)
  # instead of raising NoMethodError on nil.
  head_pattern = @opts[:head_request_pattern].to_s
  unless head_pattern.empty?
    pext = uri.path.match(/\.[^\.]*$/)
    page = @agent.head(uri) if pext && pext[0] =~ /\.#{head_pattern}/i
  end
  page = @agent.get(uri) if page.nil?

  # :delay is given in milliseconds. Round the delay itself; the original
  # rounded the return value of sleep, which had no effect on the pause.
  delay = @opts[:delay].to_f
  sleep((delay / 1000.0).round(3)) if delay > 0

  return nil if page.nil?

  PageBag.new(page, linkbag.depth + 1)
rescue => bang
  puts bang
  puts bang.backtrace if $DEBUG
  nil
end
#run ⇒ Object
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'plugins/crawler/lib/grabber.rb', line 56

# Starts the worker thread that drains the link queue.
#
# Each dequeued link is skipped when its depth exceeds @opts[:max_depth];
# otherwise it is fetched via #get_page and any non-nil result is pushed
# onto the page queue. Errors raised while handling a link are printed with
# their backtrace and the loop continues with the next link.
#
# @return [Thread] the worker thread; it loops until killed from outside
def run
  Thread.new(@link_queue, @page_queue) do |lq, pq|
    loop do
      begin
        link = lq.deq

        # A missing :max_depth means "no depth limit"; comparing against nil
        # would otherwise raise for every single link.
        max_depth = @opts[:max_depth]
        next if max_depth && link.depth > max_depth

        page = get_page(link)
        pq << page unless page.nil?
      rescue => bang
        puts bang
        puts bang.backtrace
      end
    end
  end
end