Class: BrowserCrawler::EngineUtilities::CrawlManager

Inherits:
Object
  • Object
show all
Defined in:
lib/browser_crawler/engine_utilities/crawl_manager.rb

Overview

This is the main operating class; it controls the queue of unvisited links.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(report_store:, max_pages: 0, deep_visit: false, logger: nil) ⇒ CrawlManager

Returns a new instance of CrawlManager.



20
21
22
23
24
25
26
27
28
# File 'lib/browser_crawler/engine_utilities/crawl_manager.rb', line 20

# Stores the crawl configuration on the new instance.
#
# @param report_store the object kept in @report_store; #crawl calls
#   +record_unrecognized_link+ on it
# @param max_pages value kept in @max_pages (defaults to 0)
# @param deep_visit value kept in @deep_visit (defaults to false)
# @param logger logger to use; when nil (or false) a Logger writing to
#   STDOUT is created instead
def initialize(report_store:, max_pages: 0, deep_visit: false, logger: nil)
  @deep_visit   = deep_visit
  @max_pages    = max_pages
  @report_store = report_store

  # Fall back to a STDOUT logger only when the caller did not supply one.
  @logger = logger
  @logger ||= Logger.new(STDOUT)
end

Instance Attribute Details

#deep_visit ⇒ Object (readonly)

Returns the value of attribute deep_visit.



11
12
13
# File 'lib/browser_crawler/engine_utilities/crawl_manager.rb', line 11

# Reader for the +deep_visit+ option handed to the constructor.
#
# @return the value stored in @deep_visit (false unless the caller passed
#   something else to #initialize)
def deep_visit
  @deep_visit
end

#host_name ⇒ Object (readonly)

Returns the value of attribute host_name.



11
12
13
# File 'lib/browser_crawler/engine_utilities/crawl_manager.rb', line 11

# Reader for the host that #crawl derives from its +target_url+ argument.
#
# @return the value stored in @host_name; the constructor does not assign
#   it, so it is nil until #crawl (or other code) sets it
def host_name
  @host_name
end

#logger ⇒ Object (readonly)

Returns the value of attribute logger.



11
12
13
# File 'lib/browser_crawler/engine_utilities/crawl_manager.rb', line 11

# Reader for the logger chosen in the constructor.
#
# @return the logger passed to #initialize, or the STDOUT Logger created
#   there when none was given
def logger
  @logger
end

#max_pages ⇒ Object (readonly)

Returns the value of attribute max_pages.



11
12
13
# File 'lib/browser_crawler/engine_utilities/crawl_manager.rb', line 11

# Reader for the +max_pages+ option handed to the constructor.
#
# NOTE(review): how the default of 0 is interpreted (e.g. by the page-limit
# check) is not visible here — confirm before relying on it.
#
# @return the value stored in @max_pages (0 unless overridden)
def max_pages
  @max_pages
end

#page_inspector ⇒ Object (readonly)

Returns the value of attribute page_inspector.



11
12
13
# File 'lib/browser_crawler/engine_utilities/crawl_manager.rb', line 11

# Reader for @page_inspector.
#
# NOTE(review): neither the constructor nor the body of #crawl assigns
# @page_inspector directly — confirm which method sets it.
#
# @return the value stored in @page_inspector, or nil if it was never set
def page_inspector
  @page_inspector
end

#report_store ⇒ Object (readonly)

Returns the value of attribute report_store.



11
12
13
# File 'lib/browser_crawler/engine_utilities/crawl_manager.rb', line 11

# Reader for the report store handed to the constructor. #crawl uses it to
# record links that fail validation (+record_unrecognized_link+).
#
# @return the value stored in @report_store
def report_store
  @report_store
end

#target_url ⇒ Object (readonly)

Returns the value of attribute target_url.



11
12
13
# File 'lib/browser_crawler/engine_utilities/crawl_manager.rb', line 11

# Reader for @target_url.
#
# The constructor does not assign @target_url, so this is nil until some
# other method stores a value.
#
# @return the value stored in @target_url, or nil if it was never set
def target_url
  @target_url
end

#unvisited_links_queue ⇒ Object (readonly)

Returns the value of attribute unvisited_links_queue.



11
12
13
# File 'lib/browser_crawler/engine_utilities/crawl_manager.rb', line 11

# Reader for the queue of links still waiting to be processed. #crawl
# creates it as an Array holding the start URL and removes links from the
# front with +shift+.
#
# @return the value stored in @unvisited_links_queue; the constructor does
#   not assign it, so it is nil until #crawl (or other code) sets it
def unvisited_links_queue
  @unvisited_links_queue
end

Instance Method Details

#crawl(target_url:, capybara_session:, screenshot_operator: nil) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/browser_crawler/engine_utilities/crawl_manager.rb', line 30

# Crawls starting from +target_url+, taking links from the front of
# +unvisited_links_queue+ until the queue is empty or +limit_reached?+
# returns true.
#
# Every dequeued link is wrapped in a LinkInspector. A link rejected by
# +link_valid?+ is logged and reported through
# +report_store.record_unrecognized_link+; an accepted link is handed to
# +inspect_page+.
#
# @param target_url URL the crawl starts from; its host becomes +host_name+
# @param capybara_session session object forwarded unchanged to +inspect_page+
# @param screenshot_operator optional object forwarded unchanged to
#   +inspect_page+
# @return [nil]
def crawl(target_url:, capybara_session:, screenshot_operator: nil)
  @host_name             = UrlTools.uri!(url: target_url).host
  # Remember the start URL (only once it has parsed successfully) so the
  # +target_url+ reader reflects the crawl in progress instead of staying nil.
  @target_url            = target_url
  @unvisited_links_queue = [target_url]

  loop do
    break if unvisited_links_queue.empty? || limit_reached?

    unvisited_link = unvisited_links_queue.shift
    link_inspector = LinkInspector.new(raw_link: unvisited_link,
                                       host_name: host_name)

    if link_valid?(link_inspector)
      inspect_page(link_inspector: link_inspector,
                   capybara_session: capybara_session,
                   screenshot_operator: screenshot_operator)
    else
      logger.info("Skipped visited #{unvisited_link}")
      report_store.record_unrecognized_link(unvisited_link)
    end
  end
end

#link_valid?(link_inspector) ⇒ Boolean

Returns:

  • (Boolean)


54
55
56
57
58
# File 'lib/browser_crawler/engine_utilities/crawl_manager.rb', line 54

# Decides whether a queued link should be inspected.
#
# The three checks run in order and stop at the first falsy result: the
# inspector's own +link_valid?+, then +internal_resource?+, then
# +page_unvisited?+.
#
# @param link_inspector inspector wrapping the link under consideration
# @return the result of the last check that ran (truthy only when all pass)
def link_valid?(link_inspector)
  verdict = link_inspector.link_valid?
  verdict &&= internal_resource?(link_inspector)
  verdict &&= page_unvisited?(link_inspector)
  verdict
end