Class: LameSitemapper::Core

Inherits:
Object
Defined in:
lib/core.rb

Instance Method Summary

Constructor Details

#initialize(out, opts) ⇒ Core

Returns a new instance of Core.



# File 'lib/core.rb', line 14

def initialize(out, opts)
  @out = out
  @opts = opts
end
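
A minimal construction sketch. The shape of opts is an assumption here: any object responding to the option readers Core uses (use_robots, scraper_threads) will do, e.g. an OpenStruct; the real option parsing lives outside this file.

require "ostruct"

out = $stdout                                # destination for sitemap output
opts = OpenStruct.new(use_robots: true,      # honor robots.txt
                      scraper_threads: 4)    # size of the scraper pool
core = LameSitemapper::Core.new(out, opts)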

Instance Method Details

#start(host, start_url) ⇒ Object
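
Crawls host starting from start_url with a pool of scraper threads and returns a two-element array: the root Page of the resulting sitemap tree (nil if robots.txt or the starting URL cannot be fetched) and the start URL, updated if the host redirects.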



# File 'lib/core.rb', line 19

def start(host, start_url)
  if @opts.use_robots
    @robots = WebRobots.new(SETTINGS[:web_settings][:useragent], {
      crawl_delay: :sleep,
      http_get: lambda do |url|
        response = WebHelper.get_http_response(url)
        return unless response
        response.body.force_encoding("UTF-8")
      end
    })

    if (error = @robots.error(host))
      msg = "unable to fetch robots.txt: #{error}"
      LOGGER.fatal msg
      $stderr.puts msg
      return [nil, start_url]
    end
  end

  # check whether the host redirects elsewhere; if it does, adopt the redirect target as start_url
  response = WebHelper.get_http_response(start_url, :head)
  unless response
    msg = "unable to fetch starting url"
    LOGGER.fatal msg
    $stderr.puts msg

    return [nil, start_url]
  end

  if response.redirect_count.to_i > 0
    host = UrlHelper.get_normalized_host(response.effective_url)
    start_url = UrlHelper.get_normalized_url(host, response.effective_url)
  end

  urls_queue = Queue.new
  pages_queue = Queue.new
  seen_urls = {}
  threads = []
  root = nil

  Thread.abort_on_exception = true
  # Spawn the scraper pool; each element of the range already serves as the
  # 1-based thread index, so plain each is what is intended here.
  (1..@opts.scraper_threads.to_i).each do |index|
    threads << Thread.new { Scraper.new(seen_urls, urls_queue, pages_queue, index, @opts, @robots).run }
  end

  # Seed the crawl with the start URL at depth 0.
  urls_queue.push(host: host, url: start_url, depth: 0, parent: root)

  loop do
    msg = pages_queue.pop
    if msg[:page]
      if LOGGER.info?
        if msg[:page].scraped?
          details = ": a(#{msg[:page].anchors.count}), img(#{msg[:page].images.count}), link(#{msg[:page].links.count}), script(#{msg[:page].scripts.count})"
        else
          details = ": #{msg[:page].format_codes}"
        end
        LOGGER.info "#{UrlHelper.log_prefix(msg[:depth])} created at #{msg[:page].path}#{details}"
      end

      # Enqueue every outgoing anchor for the scrapers, one level deeper.
      msg[:page].anchors.each do |anchor|
        urls_queue.push(host: host, url: anchor, depth: msg[:depth] + 1, parent: msg[:page])
      end

      # A page without a parent is the crawl root; all others are attached
      # beneath their parent to form the sitemap tree.
      if msg[:parent].nil?
        root = msg[:page]
      else
        msg[:parent].sub_pages << msg[:page]
      end
    end

    # Termination check: when both queues look empty, spin until every
    # scraper thread is blocked waiting on urls_queue (no work in flight).
    # If no new pages arrived in the meantime, push one nil per thread as a
    # poison pill and stop.
    if urls_queue.empty? && pages_queue.empty?
      until urls_queue.num_waiting == threads.size
        Thread.pass
      end
      if pages_queue.empty?
        threads.size.times { urls_queue << nil }
        break
      end
    end
  end

  threads.each(&:join)

  [root, start_url]
end
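
A usage sketch for #start. The Page accessors (#path, #sub_pages) are the ones used in the method above; the depth-first printer itself is illustrative, and the exact shape expected for host (bare host vs. normalized URL, via UrlHelper) is an assumption here.

root, start_url = core.start("http://example.com", "http://example.com/")
if root
  # Print the sitemap tree, indenting two spaces per depth level.
  print_page = lambda do |page, depth|
    puts "#{"  " * depth}#{page.path}"
    page.sub_pages.each { |sub| print_page.call(sub, depth + 1) }
  end
  print_page.call(root, 0)
else
  warn "crawl failed; effective start url was #{start_url}"
end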