Class: LinkChecker

Inherits:
Object
  • Object
show all
Defined in:
lib/link_checker.rb

Defined Under Namespace

Classes: Error, Good, Redirect, Result

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(params) ⇒ LinkChecker

Create a new instance of LinkChecker

Parameters:

  • params (Hash)

    A hash containing the :target value, which can represent either a file path or a URL, and an optional :options value, which contains a hash of optional parameters. This can include :no_warnings, :warnings_are_errors, or :max_threads.



17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/link_checker.rb', line 17

# Create a new instance of LinkChecker.
#
# @param params [Hash] A hash containing the :target value (a file path or a
#   URL; defaults to './') and an optional :options hash, which may include
#   :no_warnings, :warnings_are_errors, or :max_threads (defaults to 100).
def initialize(params)
  # Work on a copy so that filling in the default below never mutates (or
  # fails on a frozen) hash that still belongs to the caller.
  @options = (params[:options] || { }).dup
  @target =  params[:target] || './'

  @html_files = []
  @links = []
  @errors = []
  @warnings = []
  @return_code = 0

  @options[:max_threads] ||= 100
end

Class Method Details

.check_uri(uri, redirected = false) ⇒ LinkChecker::Result

Check one URL.

Parameters:

  • uri (URI)

    A URI object for the target URL.

Returns:

  • (LinkChecker::Result)



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/link_checker.rb', line 53

# Check one URL, following HTTP redirects until a final response is reached.
#
# @param uri [URI] A URI object for the target URL.
# @param redirected [Boolean] Whether this request is itself the result of
#   following a redirect; a successful response is then reported as a Redirect.
# @param redirect_limit [Integer] How many further redirects may be followed
#   before giving up with an Error.
# @return [LinkChecker::Result] A Good, Redirect or Error result.
def self.check_uri(uri, redirected=false, redirect_limit=10)
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true if uri.scheme == "https"
  http.start do
    # Request the path together with the query string; without the query a
    # link such as /search?q=ruby would be checked as plain /search.
    path = uri.path.to_s.empty? ? '/' : uri.path
    path = "#{path}?#{uri.query}" if uri.query
    http.request_get(path) do |response|
      case response
      when Net::HTTPSuccess then
        if redirected
          return Redirect.new(:final_destination_uri_string => uri.to_s)
        else
          return Good.new(:uri_string => uri.to_s)
        end
      when Net::HTTPRedirection then
        location = response['location']
        # A redirection without a Location header cannot be followed.
        return Error.new(:uri_string => uri.to_s, :error => response) if location.nil?
        # Stop instead of recursing forever when redirects form a cycle.
        if redirect_limit <= 0
          return Error.new(:uri_string => uri.to_s, :error => 'Too many redirects')
        end
        # Location may be relative; resolve it against the URI just requested.
        return self.check_uri(uri.merge(location), true, redirect_limit - 1)
      else
        return Error.new(:uri_string => uri.to_s, :error => response)
      end
    end
  end
end

.external_link_uri_strings(source) ⇒ Array

Find a list of all external links in the specified target, represented as URI strings.

Parameters:

  • source (String)

    The HTML to scan for links; it is passed to Nokogiri::HTML for parsing.

Returns:

  • (Array)

    A list of URI strings.



41
42
43
44
45
46
# File 'lib/link_checker.rb', line 41

# Collect the external links of an HTML document as URI strings.
#
# @param source [String] The HTML to scan; it is handed to Nokogiri::HTML.
# @return [Array] The href values of every <a> element whose href matches
#   an http:// or https:// prefix.
def self.external_link_uri_strings(source)
  anchors = Nokogiri::HTML(source).css('a')
  external_anchors = anchors.select do |anchor|
    href = anchor.attribute('href')
    href && href.value =~ /^https?\:\/\//
  end
  external_anchors.map { |anchor| anchor.attributes['href'].value }
end

Instance Method Details

#check_page(page, page_name) ⇒ Object

Spawn a thread to check an HTML page, and then spawn a thread for checking each link within that page.

Parameters:

  • page (String)

    The contents of the HTML page, as a string.

  • page_name (String)

    The name of the page, which will be reported if there is an error or a warning.



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/link_checker.rb', line 139

# Spawn a thread to check an HTML page; that thread in turn spawns one thread
# per external link found in the page and reports once all of them finished.
#
# @param page [String] The contents of the HTML page.
# @param page_name [String] The name of the page, passed on to #report_results.
# @return [Thread] The thread checking the page; join it to wait for the report.
#
# NOTE(review): Thread.exclusive is deprecated in newer Ruby releases (verify
# against the targeted Ruby version); a Mutex owned by the instance would be
# the usual replacement.
def check_page(page, page_name)
  Thread.new do
    threads = []
    results = []
    self.class.external_link_uri_strings(page).each do |uri_string|
      # Record the link itself rather than the page it was found on, so that
      # @links holds link strings instead of one copy of the page per link.
      Thread.exclusive { @links << uri_string }
      wait_to_spawn_thread
      threads << Thread.new do
        begin
          uri = URI(uri_string)
          response = self.class.check_uri(uri)
          # Report the link as it appeared in the page.
          response.uri_string = uri_string
          Thread.exclusive { results << response }
        rescue => error
          # Any failure while checking a link is reported as an Error result.
          Thread.exclusive { results <<
            Error.new( :error => error.to_s, :uri_string => uri_string) }
        end
      end
    end
    threads.each {|thread| thread.join }
    report_results(page_name, results)
  end
end

#check_uris ⇒ Object

Check the URLs in the @target, either using #check_uris_by_crawling or #check_uris_in_files, depending on whether the @target looks like an http:// URL or a file path.



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/link_checker.rb', line 78

# Check the URLs in the @target and print a summary of the outcome.
#
# A target matching http:// or https:// is crawled with
# #check_uris_by_crawling; anything else is handled by #check_uris_in_files.
# An exception raised by either is printed rather than propagated.
#
# @return [Object] The value of @return_code.
def check_uris
  begin
    crawl = @target =~ /^https?\:\/\//
    crawl ? check_uris_by_crawling : check_uris_in_files
  rescue => error
    puts "Error: #{error.to_s}".red
  end

  # Report the final results, but only if at least one HTML file was seen.
  unless @html_files.empty?
    file_count = @html_files.size
    link_count = @links.size
    link_noun = link_count.eql?(1) ? 'link' : 'links'
    file_noun = file_count.eql?(1) ? 'file' : 'files'
    checked = "Checked #{link_count} #{link_noun} in #{file_count} HTML #{file_noun}"

    if @errors.empty?
      puts "#{checked} and found no errors.".green
    else
      error_count = @errors.size
      error_noun = error_count.eql?(1) ? 'error' : 'errors'
      puts "#{checked} and found #{error_count} #{error_noun}.".red
    end
  end

  @return_code
end

#check_uris_by_crawling ⇒ Object

Use Anemone to crawl the pages at the @target URL, and then check all of the external URLs in those pages.



108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/link_checker.rb', line 108

# Crawl the pages at the @target URL with Anemone and check the external
# URLs of every crawled page via #check_page.
#
# Each crawled page is appended to @html_files. A page that carries an error
# aborts the crawl by raising a StandardError built from that error.
#
# @return [Array] The page-checking threads, after all of them were joined.
def check_uris_by_crawling
  page_threads = []
  Anemone.crawl(@target) do |crawler|
    crawler.storage = Anemone::Storage.PStore('link-checker-crawled-pages.pstore')
    crawler.on_every_page do |page|
      failure = page.error
      raise StandardError.new(failure) if failure
      page_threads << check_page(page.body, page.url.to_s)
      @html_files << page
    end
  end
  page_threads.each(&:join)
end

#check_uris_in_files ⇒ Object

Treat the @target as a file path and find all HTML files under that path, and then scan all of the external URLs in those files.



123
124
125
126
127
128
129
130
131
# File 'lib/link_checker.rb', line 123

# Treat the @target as a file path, find all HTML files under that path, and
# check the external URLs in each of those files via #check_page.
#
# Every file that is handed to #check_page is also appended to @html_files.
#
# @return [Array<Thread>] The page-checking threads, after all were joined.
def check_uris_in_files
  threads = []
  html_file_paths.each do |file|
    wait_to_spawn_thread
    # Read the file here instead of passing an open handle that nothing ever
    # closes. The bytes are read without transcoding; interpreting them is
    # left to the HTML parser used further down.
    threads << check_page(File.binread(file), file)
    @html_files << file
  end
  threads.each{|thread| thread.join }
end

#html_file_paths ⇒ Object

Find a list of HTML files in the @target path, which was set in the #initialize method.



31
32
33
34
35
# File 'lib/link_checker.rb', line 31

# List the HTML files found under the @target path.
#
# @return [Array] The paths of all regular files below @target whose name
#   ends in .htm or .html.
def html_file_paths
  Find.find(@target).select do |candidate|
    FileTest.file?(candidate) && candidate =~ /\.html?$/
  end
end

#report_results(page_name, results) ⇒ Object

Report the results of scanning one HTML page.

Parameters:

  • page_name (String)

    The name of the page.

  • results (Array)

    An array of Result objects.



167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# File 'lib/link_checker.rb', line 167

# Report the results of scanning one HTML page and fold them into the totals
# kept on the instance (@errors, @warnings, @return_code).
#
# Error results are errors and Redirect results are warnings; when the
# :warnings_are_errors option is set, warnings are reported as errors too.
# The :no_warnings option suppresses the printing of warnings.
#
# @param page_name [String] The name of the page.
# @param results [Array] An array of Result objects.
def report_results(page_name, results)
  errors = results.select { |result| result.instance_of?(Error) }
  warnings = results.select { |result| result.instance_of?(Redirect) }
  if @options[:warnings_are_errors]
    # Promote every warning to an error before anything is counted.
    errors += warnings
    warnings = []
  end
  @return_code = 1 unless errors.empty?

  Thread.exclusive do
    # Store the results in the LinkChecker instance.
    # This must be thread-exclusive to avoid a race condition.
    @errors.concat(errors)
    @warnings.concat(warnings)

    if errors.empty?
      quiet = @options[:no_warnings]
      headline = "Checked: #{page_name}"
      puts(warnings.empty? || quiet ? headline.green : headline.yellow)
      unless quiet
        warnings.each do |redirect|
          puts "   Warning: #{redirect.uri_string}".yellow
          puts "     Redirected to: #{redirect.final_destination_uri_string}".yellow
        end
      end
    else
      puts "Problem: #{page_name}".red
      errors.each do |failure|
        puts "   Link: #{failure.uri_string}".red
        detail =
          case failure
          when Redirect then "     Redirected to: #{failure.final_destination_uri_string}"
          when Error then "     Response: #{failure.error.to_s}"
          end
        puts detail.red if detail
      end
    end
  end
end