Class: HTMLProofer::UrlValidator::External
- Inherits:
-
HTMLProofer::UrlValidator
- Object
- HTMLProofer::UrlValidator
- HTMLProofer::UrlValidator::External
- Includes:
- HTMLProofer::Utils
- Defined in:
- lib/html_proofer/url_validator/external.rb
Instance Attribute Summary collapse
-
#before_request ⇒ Object
writeonly
Sets the attribute before_request.
-
#external_urls ⇒ Object
readonly
Returns the value of attribute external_urls.
Instance Method Summary collapse
- #add_failure(metadata, description, status = nil) ⇒ Object
-
#check_hash_in_2xx_response(href, url, response, filenames) ⇒ Object
Even though the response was a success, we may have been asked to check if the hash on the URL exists on the page.
- #handle_connection_failure(href, metadata, response_code, status_message) ⇒ Object
- #handle_timeout(href, filenames, response_code) ⇒ Object
-
#initialize(runner, external_urls) ⇒ External
constructor
A new instance of External.
- #queue_request(method, url, filenames) ⇒ Object
- #response_handler(response, url, filenames) ⇒ Object
-
#run_external_link_checker(external_urls) ⇒ Object
Proofer runs faster if we pull out all the external URLs and run the checks at the end.
- #validate ⇒ Object
Methods included from HTMLProofer::Utils
#blank?, #create_nokogiri, #pluralize
Constructor Details
#initialize(runner, external_urls) ⇒ External
Returns a new instance of External.
16 17 18 19 20 21 22 23 24 |
# File 'lib/html_proofer/url_validator/external.rb', line 16
# Returns a new instance of External.
#
# @param runner [Object] forwarded to the superclass constructor; `@runner.options[:hydra]`
#   is passed to Typhoeus::Hydra.new (presumably `@runner` is assigned by the superclass)
# @param external_urls [Object] the external URLs to check; `validate` hands this to
#   `run_external_link_checker`, which iterates it with `each_pair`
def initialize(runner, external_urls)
  super(runner)

  @external_urls = external_urls
  @hydra = Typhoeus::Hydra.new(@runner.options[:hydra])
  # Callbacks that queue_request invokes with every request before queueing it.
  @before_request = []

  @paths_with_queries = {}
end
Instance Attribute Details
#before_request=(value) ⇒ Object (writeonly)
Sets the attribute before_request
14 15 16 |
# File 'lib/html_proofer/url_validator/external.rb', line 14
# Sets the attribute before_request.
#
# @param value [Object] the new value; queue_request calls `each` on it and
#   `call(request)` on every element, so it should be a collection of callables
# @return [Object] the assigned value
def before_request=(value)
  @before_request = value
end
#external_urls ⇒ Object (readonly)
Returns the value of attribute external_urls.
13 14 15 |
# File 'lib/html_proofer/url_validator/external.rb', line 13
# Returns the value of attribute external_urls.
#
# @return [Object] whatever was stored in @external_urls (set by the constructor)
def external_urls
  @external_urls
end
Instance Method Details
#add_failure(metadata, description, status = nil) ⇒ Object
193 194 195 196 197 198 199 200 201 |
# File 'lib/html_proofer/url_validator/external.rb', line 193
# Records one failure per metadata entry, or a single failure with an empty
# path when there is no metadata.
#
# @param metadata [Enumerable, nil] entries read via `m[:filename]` and `m[:line]`
# @param description [String] failure message
# @param status [Object, nil] forwarded to Failure as `status:`
def add_failure(metadata, description, status = nil)
  if blank?(metadata) # possible if we're checking an array of links
    @failed_checks << Failure.new("", "Links > External", description, status: status)
  else
    metadata.each do |m|
      @failed_checks << Failure.new(m[:filename], "Links > External", description, line: m[:line], status: status)
    end
  end
end
#check_hash_in_2xx_response(href, url, response, filenames) ⇒ Object
Even though the response was a success, we may have been asked to check if the hash on the URL exists on the page
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
# File 'lib/html_proofer/url_validator/external.rb', line 118
# Even though the response was a success, we may have been asked to check
# if the hash on the URL exists on the page.
#
# @param href [String] the requested URL, used in the message and as the cache key
# @param url [Object] URL wrapper responding to hash?, hash, host, without_hash and to_s
# @param response [Object] response exposing options, body and code
#   (presumably a Typhoeus::Response)
# @param filenames [Object] metadata forwarded to add_failure and the cache
# @return [Boolean] true when a missing-hash failure was recorded; false when the
#   check is disabled, not applicable, or the hash was found (the caller records
#   "OK" in the cache only for a false result)
def check_hash_in_2xx_response(href, url, response, filenames)
  return false if @runner.options[:only_4xx]
  return false unless @runner.options[:check_external_hash]
  return false unless url.hash?

  hash = url.hash
  headers = response.options.fetch(:headers, {})
  content_type = headers.find { |k, _| k.casecmp("content-type").zero? }

  # Only `#page=N` fragments are verified against the PDF itself; any other
  # fragment falls through to the HTML anchor lookup below.
  pdf_page = hash[/\Apage=(\d+)\z/, 1] if content_type && content_type[1].include?("pdf")

  hash_found =
    if pdf_page
      # attempt to verify PDF hash ref; see #787 for more details
      # FIXME: this is re-reading the PDF response
      io = URI.parse(url.to_s).open
      begin
        page = pdf_page.to_i
        # Page numbers are 1-based; page 0 must not wrap to the last page
        # through a negative array index.
        page.positive? && !PDF::Reader.new(io).pages[page - 1].nil?
      ensure
        io.close
      end
    else
      body_doc = create_nokogiri(response.body)

      unencoded_hash = Addressable::URI.unescape(hash)
      # NOTE(review): the hash is interpolated into the XPath unescaped; a
      # fragment containing a double quote would yield an invalid expression.
      xpath = [%(//*[@name="#{hash}"]|//*[@name="#{unencoded_hash}"]|//*[@id="#{hash}"]|//*[@id="#{unencoded_hash}"])]
      # user-content is a special addition by GitHub.
      if /github\.com/i.match?(url.host)
        xpath << %(//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])
        # when linking to a file on GitHub, like #L12-L34, only the first "L" portion
        # will be identified as a linkable portion
        line_anchor = hash[/\A(L\d+)/, 1]
        xpath << %(//td[@id="#{line_anchor}"]) if line_anchor
      end

      !body_doc.xpath(xpath.join("|")).empty?
    end

  return false if hash_found

  msg = "External link #{href} failed: #{url.without_hash} exists, but the hash '#{hash}' does not"
  add_failure(filenames, msg, response.code)
  @cache.add_external(href, filenames, response.code, msg, false)
  true
end
#handle_connection_failure(href, metadata, response_code, status_message) ⇒ Object
175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
# File 'lib/html_proofer/url_validator/external.rb', line 175
# Records a connection-level failure (response_handler calls this when the
# response code is zero and the request did not time out).
#
# The cache entry is always written with status 0; the failure itself is
# suppressed when the :only_4xx option is set.
#
# @param href [String] the requested URL
# @param metadata [Object] forwarded to the cache and to add_failure
# @param response_code [Integer] forwarded to add_failure as the status
# @param status_message [String, nil] appended to the message unless blank
def handle_connection_failure(href, metadata, response_code, status_message)
  msgs = [<<~MSG,
    External link #{href} failed with something very wrong.
    It's possible libcurl couldn't connect to the server, or perhaps the request timed out.
    Sometimes, making too many requests at once also breaks things.
  MSG
  ]

  msgs << "Either way, the return message from the server is: #{status_message}" unless blank?(status_message)

  # chomp drops the heredoc's trailing newline when no status line was appended.
  msg = msgs.join("\n").chomp

  @cache.add_external(href, metadata, 0, msg, false)
  return if @runner.options[:only_4xx]

  add_failure(metadata, msg, response_code)
end
#handle_timeout(href, filenames, response_code) ⇒ Object
167 168 169 170 171 172 173 |
# File 'lib/html_proofer/url_validator/external.rb', line 167
# Records a timed-out request.
#
# The cache entry is always written with status 0; the failure itself is
# suppressed when the :only_4xx option is set.
#
# @param href [String] the requested URL
# @param filenames [Object] metadata forwarded to the cache and to add_failure
# @param response_code [Integer] included in the message and used as the failure status
def handle_timeout(href, filenames, response_code)
  msg = "External link #{href} failed: got a time out (response code #{response_code})"
  @cache.add_external(href, filenames, 0, msg, false)
  return if @runner.options[:only_4xx]

  add_failure(filenames, msg, response_code)
end
#queue_request(method, url, filenames) ⇒ Object
72 73 74 75 76 77 78 79 80 |
# File 'lib/html_proofer/url_validator/external.rb', line 72
# Builds a request for the URL, runs the before_request callbacks on it and
# queues it on the hydra; the response is delivered to response_handler.
#
# @param method [Symbol] HTTP method, merged into the :typhoeus options as `method:`
# @param url [Object] URL wrapper; `url.url` is the address requested
# @param filenames [Object] metadata passed through to response_handler
def queue_request(method, url, filenames)
  # merge returns a new hash, so the runner's :typhoeus options are not modified.
  opts = @runner.options[:typhoeus].merge(method: method)
  request = Typhoeus::Request.new(url.url, opts)
  @before_request.each do |callback|
    callback.call(request)
  end
  request.on_complete { |response| response_handler(response, url, filenames) }
  @hydra.queue(request)
end
#response_handler(response, url, filenames) ⇒ Object
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/html_proofer/url_validator/external.rb', line 82
# Handles a completed request: caches successes, records failures, and
# retries a failed HEAD as a GET.
#
# @param response [Object] completed response exposing request, code, body,
#   timed_out? and status_message (presumably a Typhoeus::Response)
# @param url [Object] URL wrapper, passed on to the hash check and to GET retries
# @param filenames [Object] metadata forwarded to the cache and to add_failure
def response_handler(response, url, filenames)
  method = response.request.options[:method]
  href = response.request.base_url.to_s
  response_code = response.code
  # Strip NUL bytes from the body in place before anything reads it.
  response.body.delete!("\x00")

  @logger.log(:debug, "Received a #{response_code} for #{href}")

  return if @runner.options[:ignore_status_codes].include?(response_code)

  if response_code.between?(200, 299)
    # check_hash_in_2xx_response records its own failure and cache entry when
    # it returns a truthy value, so "OK" is only cached otherwise.
    hash_failed = check_hash_in_2xx_response(href, url, response, filenames)
    @cache.add_external(href, filenames, response_code, "OK", true) unless hash_failed
  elsif response.timed_out?
    handle_timeout(href, filenames, response_code)
  elsif response_code.zero?
    handle_connection_failure(href, filenames, response_code, response.status_message)
  elsif method == :head # some servers don't support HEAD
    queue_request(:get, url, filenames)
  else
    return if @runner.options[:only_4xx] && !response_code.between?(400, 499)

    # Received a non-successful http response.
    status_message = blank?(response.status_message) ? "" : ": #{response.status_message}"
    msg = "External link #{href} failed#{status_message}"
    add_failure(filenames, msg, response_code)
    @cache.add_external(href, filenames, response_code, msg, false)
  end
end
#run_external_link_checker(external_urls) ⇒ Object
Proofer runs faster if we pull out all the external URLs and run the checks at the end. Otherwise, we’re halting the consuming process for every file during `process_files`.
In addition, sorting the list lets libcurl keep connections to the same hosts alive.
Finally, we’ll first make a HEAD request, rather than GETting all the contents. If the HEAD fails, we’ll fall back to GET, as some servers are not configured for HEAD. If we’ve decided to check for hashes, we must do a GET; HEAD is not available as an option.
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
# File 'lib/html_proofer/url_validator/external.rb', line 46
# Queues one request per external URL and then runs the hydra.
#
# Invalid URLs are recorded as failures with status 0; URLs for which
# new_url_query_values? returns false are skipped. A URL with a hash is
# fetched with GET when :check_external_hash is set, otherwise HEAD is used.
#
# @param external_urls [#each_pair] maps each URL to its metadata
def run_external_link_checker(external_urls)
  # Route log from Typhoeus/Ethon to our own logger
  Ethon.logger = @logger

  external_urls.each_pair do |external_url, metadata|
    url = Attribute::Url.new(@runner, external_url, base_url: nil)

    unless url.valid?
      add_failure(metadata, "#{url} is an invalid URL", 0)
      next
    end

    next unless new_url_query_values?(url)

    method = if @runner.options[:check_external_hash] && url.hash?
      :get
    else
      :head
    end

    queue_request(method, url, metadata)
  end

  @hydra.run
end
#validate ⇒ Object
26 27 28 29 30 31 32 33 34 |
# File 'lib/html_proofer/url_validator/external.rb', line 26
# Checks the external URLs and returns the failures collected so far.
#
# The URLs come from the runner's external cache when the cache reports
# external caching as enabled, otherwise from @external_urls.
#
# @return [Object] @failed_checks
def validate
  urls_to_check = @cache.external_enabled? ? @runner.load_external_cache : @external_urls
  urls_detected = pluralize(urls_to_check.count, "external link", "external links")
  @logger.log(:info, "Checking #{urls_detected}")

  run_external_link_checker(urls_to_check)

  @failed_checks
end