Class: BrokenLinkFinder::Finder

Inherits:
Object
  • Object
show all
Defined in:
lib/broken_link_finder/finder.rb

Overview

Class responsible for finding broken links on a page or site.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS) {|@crawler| ... } ⇒ Finder

Returns a new Finder instance.

Yields:



27
28
29
30
31
32
33
34
35
36
37
# File 'lib/broken_link_finder/finder.rb', line 27

# Returns a new Finder instance.
#
# sort:        - collection key for recorded links, :page or :link.
# max_threads: - max number of threads used by #crawl_site.
#
# Yields the underlying Wgit::Crawler for configuration when a block
# is given.
def initialize(sort: :page, max_threads: DEFAULT_MAX_THREADS, &block)
  # Fail fast on an unsupported sort key.
  unless %i[page link].include?(sort)
    raise "Sort by either :page or :link, not #{sort}"
  end

  @sort        = sort
  @max_threads = max_threads
  @crawler     = Wgit::Crawler.new
  @manager     = BrokenLinkFinder::LinkManager.new(@sort)

  yield @crawler if block_given?
end

Instance Attribute Details

#crawlerObject (readonly)

The underlying Wgit::Crawler used by this instance of Finder.



21
22
23
# File 'lib/broken_link_finder/finder.rb', line 21

# Reader for the underlying Wgit::Crawler used by this Finder.
def crawler = @crawler

#managerObject (readonly)

The underlying link manager used by this instance of Finder.



24
25
26
# File 'lib/broken_link_finder/finder.rb', line 24

# Reader for the underlying link manager used by this Finder.
def manager = @manager

#max_threadsObject (readonly)

The max number of threads created during #crawl_site - one thread per page.



18
19
20
# File 'lib/broken_link_finder/finder.rb', line 18

# Reader for the max number of threads created during #crawl_site -
# one thread per page.
def max_threads = @max_threads

#sortObject (readonly)

The collection key - either :page or :link.



15
16
17
# File 'lib/broken_link_finder/finder.rb', line 15

# Reader for the collection key - either :page or :link.
def sort = @sort

Instance Method Details

#broken_linksObject

Returns the current broken links.



40
41
42
# File 'lib/broken_link_finder/finder.rb', line 40

# Delegates to the manager: returns the current broken links.
def broken_links = @manager.broken_links

#crawl_site(url, allow_paths: nil, disallow_paths: nil) ⇒ Object Also known as: crawl_r

Finds broken links within an entire site and records them. Returns true if at least one broken link was found. Access the broken links afterwards with Finder#broken_links.



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/broken_link_finder/finder.rb', line 82

# Finds broken links within an entire site and records them. Returns
# true if at least one broken link was found. Access the broken links
# afterwards with Finder#broken_links.
#
# Raises if the given website url is invalid or unreachable.
def crawl_site(url, allow_paths: nil, disallow_paths: nil)
  @manager.empty

  start   = Time.now
  url     = url.to_url
  pool    = Thread.pool(@max_threads)
  crawled = Set.new

  # Crawl the site's HTML web pages looking for links.
  # We dup the url to avoid recording any redirects.
  paths = { allow_paths: allow_paths, disallow_paths: disallow_paths }
  externals = @crawler.crawl_site(url.dup, **paths) do |doc|
    # Guard BEFORE touching doc - previously `doc.url` was called first,
    # which would raise NoMethodError on a nil doc and made the guard dead.
    next unless doc

    crawled << doc.url

    # Start a thread for each page, checking for broken links.
    pool.process { find_broken_links(doc) }
  end

  # Wait for all threads to finish, even if url was invalid.
  pool.shutdown

  # Ensure the given website url is valid.
  raise "Invalid or broken URL: #{url}" unless externals

  retry_broken_links

  @manager.sort
  @manager.tally(url: url, pages_crawled: crawled.to_a, start: start)

  broken_links.any?
ensure
  # pool is a lexically-assigned local so `defined?(pool)` was always
  # truthy here; if an exception fired before Thread.pool assigned it,
  # pool is nil and plain `pool.shutdown` would mask the original error.
  pool&.shutdown
end

#crawl_statsObject

Returns the current crawl stats.



50
51
52
# File 'lib/broken_link_finder/finder.rb', line 50

# Delegates to the manager: returns the current crawl stats.
def crawl_stats = @manager.crawl_stats

#crawl_url(url) ⇒ Object Also known as: crawl_page

Finds broken links within a single page and records them. Returns true if at least one broken link was found. Access the broken links afterwards with Finder#broken_links.



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/broken_link_finder/finder.rb', line 57

# Finds broken links within a single page and records them. Returns
# true if at least one broken link was found. Access the broken links
# afterwards with Finder#broken_links.
#
# Raises if the given page url is invalid or unreachable.
def crawl_url(url)
  @manager.empty

  started_at = Time.now
  url        = url.to_url

  # Dup the url so any redirects aren't recorded against it.
  doc = @crawler.crawl(url.dup)

  # An empty doc means the page url was invalid or unreachable.
  raise "Invalid or broken URL: #{url}" if doc.empty?

  # Collect the page's links, flag the broken ones, then re-check them.
  find_broken_links(doc)
  retry_broken_links

  @manager.sort
  @manager.tally(url: url, pages_crawled: [url], start: started_at)

  broken_links.any?
end

#ignored_linksObject

Returns the current ignored links.



45
46
47
# File 'lib/broken_link_finder/finder.rb', line 45

# Delegates to the manager: returns the current ignored links.
def ignored_links = @manager.ignored_links

#report(stream = STDOUT, type: :text, broken_verbose: true, ignored_verbose: false) ⇒ Object

Outputs the link report into a stream e.g. STDOUT or a file, anything that respond_to? :puts. Defaults to STDOUT.



119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/broken_link_finder/finder.rb', line 119

# Outputs the link report into a stream e.g. STDOUT or a file,
# anything that respond_to? :puts. Defaults to STDOUT.
#
# Raises unless type: is :text or :html.
def report(stream = STDOUT, type: :text,
           broken_verbose: true, ignored_verbose: false)
  # Resolve the reporter class lazily per branch so an unknown type
  # raises without touching either constant.
  reporter_class =
    case type
    when :text then BrokenLinkFinder::TextReporter
    when :html then BrokenLinkFinder::HTMLReporter
    else raise "The type: must be :text or :html, not: :#{type}"
    end

  reporter = reporter_class.new(stream, @sort,
                                broken_links, ignored_links,
                                @manager.broken_link_map, crawl_stats)
  reporter.call(broken_verbose: broken_verbose,
                ignored_verbose: ignored_verbose)
end