Class: Clownfish::LinksByPage

Inherits:
Object
  • Object
show all
Defined in:
lib/clownfish/fish/links_by_page.rb

Overview

Clownfish that records every link on a page and the response status codes when the links are followed.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ LinksByPage

Returns a new instance of LinksByPage.



9
10
11
# File 'lib/clownfish/fish/links_by_page.rb', line 9

# Sets up a new LinksByPage with nothing recorded yet.
def initialize
  # Keyed by referring page url String; on_every_page fills in the values.
  @links_by_page = {}
end

Instance Attribute Details

Hash of url String to UrlStatuses. The values are all links found on page at the key.



7
8
9
# File 'lib/clownfish/fish/links_by_page.rb', line 7

# Reader for the recorded links.
#
# Returns the Hash held in @links_by_page (the same object, not a copy).
def links_by_page
  @links_by_page
end

Instance Method Details

#anemone_options ⇒ Object



13
14
15
16
# File 'lib/clownfish/fish/links_by_page.rb', line 13

# Crawl options this fish asks for (presumably handed to Anemone by the
# caller - confirm against the code that consumes this method).
#
# Returns a Hash with :discard_page_bodies set to true.
def anemone_options
  # Page bodies are never inspected here, so there is no reason to retain them.
  { discard_page_bodies: true }
end

#on_every_page(page) ⇒ Object



18
19
20
21
22
23
24
25
26
# File 'lib/clownfish/fish/links_by_page.rb', line 18

# Records a crawled page's url and response code under the page that linked
# to it.
#
# page - Object responding to #referer, #url and #code.
#
# Returns whatever UrlStatuses#add_url returns.
def on_every_page(page)
  # The first url in a crawl has no referer, so file it under a placeholder key.
  source_key = '[starting point]'
  source_key = page.referer.to_s if page.referer

  # Reuse the entry for this referring page, creating it on first sight.
  statuses = @links_by_page.fetch(source_key) do
    @links_by_page[source_key] = UrlStatuses.new
  end

  statuses.add_url(page.url.to_s, page.code)
end

#report(options = {}) ⇒ Object

Print links by page to the given IO (STDOUT by default).

options - Hash specifying what and how to report.

:to     - IO to print report to.  Defaults to STDOUT.
:status - One or Array of status specifiers. Defaults to :all.
          Only links with these statuses will be reported.  See
          Clownfish::StatusGroup for accepted status specifiers.


35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/clownfish/fish/links_by_page.rb', line 35

# Print links by page, one group per referring page.
#
# options - Hash specifying what and how to report. It is passed through
#           report_options (defined elsewhere) before use.
#           :to     - IO to print report to.
#           :status - One or Array of status specifiers handed to
#                     UrlStatuses#query to select which links are printed.
#
# Each group is the page on its own line, then one "<status> <link>" line per
# matching link, then a blank line. Pages with no matching links are omitted.
#
# Returns the links-by-page Hash (the result of Hash#each).
def report(options = {})
  settings = report_options(options)
  io = settings[:to]
  wanted = settings[:status]

  @links_by_page.each do |page, link_statuses|
    matches = link_statuses.query(wanted)
    # Nothing on this page matched the requested statuses; print nothing for it.
    next if matches.empty?

    io.puts page.to_s
    matches.each { |link, status| io.puts "#{status} #{link}" }
    io.puts
  end
end