Class: LameSitemapper::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/scraper.rb

Constant Summary collapse

# Table of [XPath expression, label] pairs, one row per kind of URL-bearing
# attribute: anchor hrefs, image srcs, link hrefs and script srcs.
# NOTE(review): how the label is consumed is not visible here; confirm
# against the code that iterates over this table.
#
# Frozen (outer array and each pair) so that the shared table cannot be
# modified by accident at runtime.
EXTRACT_TAGS = [
  ["//a/@href", "anchors"].freeze,
  ["//img/@src", "images"].freeze,
  ["//link/@href", "links"].freeze,
  ["//script/@src", "scripts"].freeze
].freeze

Instance Method Summary collapse

Constructor Details

#initialize(seen_urls, urls_queue, pages_queue, index, opts, robots) ⇒ Scraper

Returns a new instance of Scraper.



18
19
20
21
22
23
24
25
# File 'lib/scraper.rb', line 18

# Stores the collaborators handed to this scraper; no other work is done here.
#
# @param seen_urls   kept in @seen_urls (presumably the already-visited URLs; confirm against callers)
# @param urls_queue  [#pop]  queue that #run pops incoming messages from
# @param pages_queue [#push] queue that #run pushes its results onto
# @param index       numeric id of this scraper; #run uses it for the thread name and log lines
# @param opts        options object, stored as-is
# @param robots      stored as-is (presumably robots.txt rules; confirm against callers)
def initialize(seen_urls, urls_queue, pages_queue, index, opts, robots)
  # Shared state and the two queues.
  @seen_urls, @urls_queue, @pages_queue = seen_urls, urls_queue, pages_queue
  # Per-scraper identity and configuration.
  @index, @opts, @robots = index, opts, robots
end

Instance Method Details

#run ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/scraper.rb', line 27

# Worker loop for this scraper.
#
# Tags the current thread with the zero-padded scraper index, then keeps
# popping messages from @urls_queue. For every truthy message it builds a
# page with create_page and pushes a hash of :page, :url, :depth and :parent
# onto @pages_queue. A falsy message (nil/false) is the finish signal: it is
# logged and the loop ends.
#
# @return [nil]
def run
  Thread.current[:name] = "%02d" % @index
  LOGGER.debug "running scraper #{@index}"

  loop do
    job = @urls_queue.pop

    if job
      # create_page is evaluated first, before the message fields are read.
      @pages_queue.push(
        page: create_page(job),
        url: job[:url],
        depth: job[:depth],
        parent: job[:parent]
      )
    else
      LOGGER.debug "scraper #{@index} received finish message"
      break
    end
  end
end