Module: AwesomeBot
- Defined in:
- lib/awesome_bot.rb,
lib/awesome_bot/cli.rb,
lib/awesome_bot/net.rb,
lib/awesome_bot/check.rb,
lib/awesome_bot/links.rb,
lib/awesome_bot/write.rb,
lib/awesome_bot/output.rb,
lib/awesome_bot/result.rb,
lib/awesome_bot/version.rb,
lib/awesome_bot/white_list.rb
Overview
Process white list
Defined Under Namespace
Classes: Result
Constant Summary collapse
- STATUS_ERROR =
-1
- MARKDOWN_LINK_REGEX =
Matches a Markdown inline link. From left to right: a literal `[`; the link title (anything up to the next closing bracket); a literal `]`; a literal `(`; the link destination (optionally enclosed in a single pair of angle brackets); a literal `)`.
/\[ [^\]]+ \] \( <? ([^)<>]+) >? \)/x
- RESULTS_PREFIX =
'ab-results'
- STATUS_OK =
'✓'
- STATUS_OTHER =
'?'
- STATUS_400s =
'x'
- STATUS_REDIRECT =
'→'
- PROJECT =
'awesome_bot'
- PROJECT_DESCRIPTION =
'Check for valid and duplicate URLs in a file. '\ 'Great for "awesome" projects.'
- PROJECT_URL =
'https://github.com/dkhamsing/awesome_bot'
- VERSION =
'1.20.0'
Class Method Summary collapse
- .check(content, options = nil, number_of_threads = 1) {|"Links to check: #{r.links.count}"| ... } ⇒ Object
- .cli ⇒ Object
- .cli_process(filename, options) ⇒ Object
- .filter_filename(f) ⇒ Object
- .get_relative_links(content, base) ⇒ Object
- .links_filter(list) ⇒ Object
- .links_find(content, url_base = nil) ⇒ Object
- .loc(x, content) ⇒ Object
- .loc_formatted(loc, largest = 3) ⇒ Object
- .log_status(s) ⇒ Object
- .net_status(url, timeout = 30, head) ⇒ Object
- .number_of_digits(content) ⇒ Object
- .order_by_loc(list, content) ⇒ Object
- .output(x, index, total, largest) ⇒ Object
- .output_redirect(x) ⇒ Object
- .pad_list(list) ⇒ Object
- .pad_text(number, digits) ⇒ Object
- .status_is_redirected?(status) ⇒ Boolean
- .statuses(links, threads, timeout, head = false, delay = 0) ⇒ Object
- .white_list(list, item) ⇒ Object
- .write_markdown_results(filename, filtered, silent) ⇒ Object
- .write_results(f, r, silent) ⇒ Object
- .write_results_filtered(file, filtered, silent) ⇒ Object
Class Method Details
.check(content, options = nil, number_of_threads = 1) {|"Links to check: #{r.links.count}"| ... } ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/awesome_bot/check.rb', line 9
#
# Extracts the links from +content+, records duplicates, and collects a
# status for every unique link (and, separately, for white listed links).
# Progress text is yielded piece by piece when a block is given.
#
# @param content [String] text to scan for links
# @param options [Hash, nil] optional settings read by string key:
#   'whitelist', 'allowdupe', 'timeout', 'delay' and 'baseurl'
# @param number_of_threads [Integer] forwarded to +statuses+
# @yield [String] fragments of progress output
# @return [Result] the populated result object
def check(content, options=nil, number_of_threads=1)
  if options.nil?
    white_listed = nil
    skip_dupe = false
    timeout = nil
    delay = 0
    base = nil
  else
    white_listed = options['whitelist']
    skip_dupe = options['allowdupe']
    timeout = options['timeout']
    delay = options['delay']
    delay = 0 if delay.nil?
    base = options['baseurl']
  end

  links = links_filter(links_find(content, base))

  r = Result.new(links, white_listed)
  r.skip_dupe = skip_dupe

  # Count occurrences once rather than calling count(e) for every element.
  occurrences = Hash.new(0)
  r.links.each { |link| occurrences[link] += 1 }
  r.dupes = r.links.select { |link| occurrences[link] > 1 }

  unique_links = r.links.uniq

  yield "Links to check: #{r.links.count}" if block_given?
  yield ", #{r.links_white_listed.count} white listed" if r.white_listing && block_given?
  uniq = unique_links.count
  yield ", #{uniq} unique" if uniq != r.links.count && block_given?
  yield "\n" if block_given?

  total = pad_list unique_links
  unique_links.each_with_index do |u, j|
    yield " #{pad_text j + 1, total}. #{u} \n" if block_given?
  end

  # HEAD requests are never used here; every check issues a GET.
  head = false

  yield 'Checking URLs: ' if block_given? && r.links.count > 0
  r.status =
    statuses(unique_links, number_of_threads, timeout, head, delay) do |s|
      yield log_status s if block_given?
    end
  yield "\n" if block_given?

  return r if !r.white_listing || (r.links_white_listed.count == 0)

  # White listed links are checked without the caller's timeout (nil is passed).
  yield 'Checking white listed URLs: ' if block_given?
  r.white_listed =
    statuses(r.links_white_listed.uniq, number_of_threads, nil, head, delay) do |s|
      yield log_status s if block_given?
    end
  yield "\n" if block_given?

  r
end
.cli ⇒ Object
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/awesome_bot/cli.rb', line 10
#
# Command line entry point: parses ARGV, runs +cli_process+ for every file,
# prints a summary when more than one file was processed, and exits with
# status 1 unless every file produced STATUS_OK.
#
# Files come from --files when given; otherwise every remaining argument
# that does not start with "--" is treated as a file name.
def cli()
  require 'optparse'

  ARGV << '-h' if ARGV.empty?

  options = {}
  ARGV.options do |opts|
    opts.banner = "Usage: #{PROJECT} [file or files] \n"\
                  " #{PROJECT} [options]"

    opts.on('-f', '--files [files]', Array, 'Comma separated files to check') { |val| options['files'] = val }
    opts.on('-a', '--allow [errors]', Array, 'Status code errors to allow') { |val| options['errors'] = val }
    opts.on('--allow-dupe', TrueClass, 'Duplicate URLs are allowed') { |val| options['allow_dupe'] = val }
    opts.on('--allow-ssl', TrueClass, 'SSL errors are allowed') { |val| options['allow_ssl'] = val }
    opts.on('--allow-redirect', TrueClass, 'Redirected URLs are allowed') { |val| options['allow_redirect'] = val }
    opts.on('--allow-timeout', TrueClass, 'URLs that time out are allowed') { |val| options['allow_timeout'] = val }
    opts.on('--base-url [base url]', String, 'Base URL to use for relative links') { |val| options['base_url'] = val }
    opts.on('-d', '--request-delay [seconds]', Float, 'Set request delay') { |val| options['delay'] = val }
    opts.on('-t', '--set-timeout [seconds]', Integer, 'Set connection timeout (default: 30)') { |val| options['timeout'] = val }
    opts.on('--skip-save-results', TrueClass, 'Skip saving results') { |val| options['no_results'] = val }
    opts.on('-w', '--white-list [urls]', Array, 'Comma separated URLs to white list') { |val| options['white_list'] = val }

    opts.on('-v', '--version', String, 'Display version') { puts "#{PROJECT} version #{VERSION}" }
    opts.on_tail("--help") do
      puts opts
      exit
    end
    opts.parse!
  end

  files = options['files']
  files = ARGV.reject { |a| a =~ /^--.*/ } if files.nil?

  summary = {}
  files.each do |f|
    summary[f] = cli_process(f, options)
  end

  if summary.count > 1
    puts "\nSummary"

    # Right-align every file name to the width of the longest one.
    largest = summary.keys.map(&:size).max
    summary.each do |k, v|
      k_display = "%#{largest}.#{largest}s" % k
      puts "#{k_display}: #{v}"
    end
  end

  exit 1 unless summary.values.all? { |v| v == STATUS_OK }
end
.cli_process(filename, options) ⇒ Object
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
# File 'lib/awesome_bot/cli.rb', line 71
#
# Checks the links of a single file, prints a report, and writes the result
# files unless saving was disabled.
#
# @param filename [String] path of the file to read
# @param options [Hash] parsed CLI options, read by string key ('base_url',
#   'errors', 'allow_dupe', 'allow_redirect', 'allow_ssl', 'allow_timeout',
#   'delay', 'white_list', 'timeout', 'no_results')
# @return [String, StandardError] STATUS_OK when there are no issues,
#   'Issues' otherwise, or the rescued error when the file cannot be read
def cli_process(filename, options)
  begin
    untrusted = File.read filename
    # Round-trip through UTF-16 to drop byte sequences that are invalid in the source encoding.
    content = untrusted.encode('UTF-16', :invalid => :replace, :replace => '').encode('UTF-8')
  rescue => error
    puts "File open error: #{error}"
    return error
  end

  puts "> Checking links in #{filename}"

  base = options['base_url']
  puts "> Will check relative links with base URL #{base}" unless base.nil?

  errors = options['errors']
  puts "> Will allow errors: #{errors.join ','}" unless errors.nil?

  skip_dupe = options['allow_dupe']
  puts '> Will allow duplicate links' if skip_dupe == true

  allow_redirects = options['allow_redirect']
  puts '> Will allow redirects' if allow_redirects == true

  allow_ssl = options['allow_ssl']
  puts '> Will allow SSL errors' if allow_ssl == true

  allow_timeouts = options['allow_timeout']
  puts '> Will allow network timeouts' if allow_timeouts == true

  delay = options['delay']
  puts "> Will delay each request by #{delay} second#{delay == 1 ? '' : 's'}" unless delay.nil?

  white_listed = options['white_list']

  timeout = options['timeout']
  puts "> Connection timeout = #{timeout}s" unless timeout.nil?

  puts "> White list links matching: #{white_listed.join ', '} " unless white_listed.nil?

  no_results = options['no_results']
  if no_results == true
    puts '> Will not save results'
  else
    no_results = false
  end

  # Settings for +check+, kept in their own hash so the method parameter is not overwritten.
  check_options = {
    'allowdupe' => skip_dupe,
    'delay' => delay,
    'timeout' => timeout,
    'whitelist' => white_listed,
    'baseurl' => base
  }

  # A request delay forces a single thread; otherwise ten threads are used.
  threads = delay.nil? ? 10 : 1
  r = check(content, check_options, threads) do |o|
    print o
  end

  digits = number_of_digits content
  unless r.white_listed.nil?
    puts "\n> White listed:"
    o = order_by_loc r.white_listed, content
    width = pad_list(o)
    o.each_with_index do |x, k|
      temp, _ = output(x, k, width, digits)
      puts temp
    end
  end

  allow_redirects = false if allow_redirects.nil?
  allow_ssl = false if allow_ssl.nil?
  allow_timeouts = false if allow_timeouts.nil?

  # Tolerances handed to the Result success predicates.
  success_options = {
    'errors' => errors,
    'redirect' => allow_redirects,
    'ssl' => allow_ssl,
    'timeout' => allow_timeouts
  }

  if r.success(success_options) == true
    puts 'No issues :-)'
    write_results(filename, r, no_results)
    write_markdown_results(filename, nil, no_results)
    return STATUS_OK
  end

  filtered_issues = []

  puts "\nIssues :-("

  print "> Links \n"
  if r.success_links(success_options)
    puts " All OK #{STATUS_OK}"
  else
    o = order_by_loc r.statuses_issues(success_options), content
    width = pad_list(o)
    o.each_with_index do |x, k|
      temp, h = output(x, k, width, digits)
      filtered_issues.push h
      puts temp
    end
  end

  unless skip_dupe
    print "> Dupes \n"
    if r.success_dupe
      puts " None #{STATUS_OK}"
    else
      unique_dupes = r.dupes.uniq
      dupe_hash = unique_dupes.map { |x| { 'url' => x } }
      o = order_by_loc dupe_hash, content
      # Entries are sorted by line, so the last one carries the widest line number.
      largest = o.last['loc'].to_s.size
      width = pad_list(unique_dupes)
      o.each_with_index do |d, index|
        loc = d['loc']
        url = d['url']
        filtered_issues.push('loc' => loc, 'link' => url, 'error' => 'Dupe')
        print " #{pad_text index + 1, width}. "
        print loc_formatted loc, largest
        puts " #{url}"
      end
    end
  end

  write_results(filename, r, no_results)
  filtered = write_results_filtered(filename, filtered_issues, no_results)
  write_markdown_results(filename, filtered, no_results)
  'Issues'
end
.filter_filename(f) ⇒ Object
8 9 10 |
# File 'lib/awesome_bot/write.rb', line 8
#
# Replaces every "/" in a file name with "-".
#
# @param f [String] file name or path
# @return [String] copy of +f+ with "/" replaced by "-"
def filter_filename(f)
  f.tr('/', '-')
end
.get_relative_links(content, base) ⇒ Object
58 59 60 61 62 63 64 65 |
# File 'lib/awesome_bot/links.rb', line 58
#
# Collects Markdown link destinations that contain neither "http" nor "#"
# and prefixes each with +base+.
#
# @param content [String] Markdown text
# @param base [String] prefix placed in front of each destination
# @return [Array<String>] prefixed link destinations
def get_relative_links(content, base)
  destinations = content.scan(MARKDOWN_LINK_REGEX).map(&:first)
  candidates = destinations.reject { |d| d.include?('http') || d.include?('#') }

  candidates.map do |d|
    # Keep only the leading run of non-whitespace characters (drops e.g. a trailing title).
    path = d =~ /\S/ ? d.match(/^\S*/) : d
    "#{base}#{path}"
  end
end
.links_filter(list) ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
# File 'lib/awesome_bot/links.rb', line 13
#
# Cleans up extracted links: drops entries shorter than 9 characters,
# encodes commas as "%2c", cuts everything from the first single quote, and
# trims trailing markup such as brackets, parentheses, "." or ":".
#
# @param list [Array<String>] raw extracted links
# @return [Array<String>] cleaned links
def links_filter(list)
  list.reject { |x| x.length < 9 }
      .map { |x| x.gsub(',', '%2c').gsub(/'.*/, '') }
      .map do |x|
        opening = x.count('(')
        closing = x.count(')')

        if x.include?(')]')
          x.gsub(/\)\].*/, '')
        elsif closing == 1 && opening == 1
          x
        elsif closing == 2 && opening == 1
          # Keep the balanced ")" and drop the extra one plus anything after it.
          x.gsub(/\)\).*/, ')')
        elsif closing > 0
          # Wikipedia links that also contain "(" are left untouched.
          if x.include?('wikipedia') && opening > 0
            x
          else
            x.gsub(/\).*/, '')
          end
        elsif x.include?('[')
          # adoc
          x.gsub(/\[.*/, '')
        elsif x.end_with?('.', ':')
          x[0..-2]
        elsif x.end_with?('%2c')
          x[0..-4]
        else
          x
        end
      end
end
.links_find(content, url_base = nil) ⇒ Object
49 50 51 52 53 54 55 56 |
# File 'lib/awesome_bot/links.rb', line 49
#
# Extracts http/https URLs from +content+. When +url_base+ is given, the
# relative Markdown links (prefixed with it) are placed before them.
#
# @param content [String] text to scan
# @param url_base [String, nil] base URL for relative links
# @return [Array<String>] extracted links
def links_find(content, url_base=nil)
  require 'uri'

  absolute = URI.extract(content, /http()s?/)
  if url_base.nil?
    absolute
  else
    get_relative_links(content, url_base) + absolute
  end
end
.loc(x, content) ⇒ Object
9 10 11 12 13 14 15 16 17 |
# File 'lib/awesome_bot/output.rb', line 9
#
# Returns the 1-based number of the first line of +content+ that contains
# +x+. When no line matches, the number of lines is returned instead.
#
# @param x [String] text to look for
# @param content [String] text split on "\n"
# @return [Integer] line number
def loc(x, content)
  lines = content.split("\n")
  hit = lines.index { |line| line.include?(x) }
  hit.nil? ? lines.length : hit + 1
end
.loc_formatted(loc, largest = 3) ⇒ Object
19 20 21 22 |
# File 'lib/awesome_bot/output.rb', line 19
#
# Formats a line number as "[L<padded number>]".
#
# @param loc [Integer] line number
# @param largest [Integer] number of digits to pad to
# @return [String] formatted line marker
def loc_formatted(loc, largest=3)
  "[L#{pad_text(loc, largest)}]"
end
.log_status(s) ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/awesome_bot/output.rb', line 24
#
# Maps a numeric status to its one-character progress marker.
#
# @param s [Integer] status code
# @return [String] STATUS_REDIRECT for redirects, STATUS_OK for 200,
#   STATUS_400s for 400-499, STATUS_OTHER for anything else
def log_status(s)
  return STATUS_REDIRECT if status_is_redirected?(s)
  return STATUS_OK if s == 200

  s > 399 && s < 500 ? STATUS_400s : STATUS_OTHER
end
.net_status(url, timeout = 30, head) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# File 'lib/awesome_bot/net.rb', line 6
#
# Requests +url+ and returns its status code together with the response
# headers. SSL is enabled for https URLs and +timeout+ is used as the
# connection open timeout.
#
# @param url [String] URL to request
# @param timeout [Integer, nil] open timeout passed to Net::HTTP.start
# @param head [Boolean] issue a HEAD request when truthy, a GET otherwise
# @return [Array(Integer, Hash)] status code and header hash
def net_status(url, timeout=30, head)
  require 'net/http'
  require 'openssl'
  require 'uri'

  uri = URI.parse url

  Net::HTTP.start(uri.host, uri.port,
                  :use_ssl => uri.scheme == 'https',
                  :open_timeout => timeout) do |http|
    request_class = head ? Net::HTTP::Head : Net::HTTP::Get
    request = request_class.new(uri, {'User-Agent' => 'awesome_bot'})

    # Credentials embedded in the URL are sent as basic auth.
    if uri.userinfo
      user, password = uri.userinfo.split(/:/)
      request.basic_auth user, password
    end

    response = http.request request

    code = response.code.nil? ? 200 : response.code.to_i

    headers = {}
    response.each { |name, value| headers[name] = value.force_encoding("utf-8") }

    # handle incomplete redirect: a location without a scheme is prefixed
    # with the scheme and host of the requested URL
    location = headers['location']
    if location && URI.parse(location).scheme.nil?
      headers['location'] = uri.scheme + '://' + uri.host + location
    end

    [code, headers]
  end
end
.number_of_digits(content) ⇒ Object
36 37 38 39 |
# File 'lib/awesome_bot/output.rb', line 36
#
# Returns how many digits are needed to print the line count of +content+.
#
# @param content [String] text split on "\n"
# @return [Integer] digit count as computed by +pad_list+
def number_of_digits(content)
  pad_list(content.split("\n"))
end
.order_by_loc(list, content) ⇒ Object
41 42 43 44 45 46 47 48 |
# File 'lib/awesome_bot/output.rb', line 41
#
# Stores the line number of each entry's 'url' under its 'loc' key (the
# entries are modified in place) and returns the entries sorted by it.
#
# @param list [Array<Hash>] entries with a 'url' key
# @param content [String] text in which the URLs are located
# @return [Array<Hash>] entries ordered by 'loc'
def order_by_loc(list, content)
  list
    .each { |entry| entry['loc'] = loc(entry['url'], content) }
    .sort_by { |entry| entry['loc'] }
end
.output(x, index, total, largest) ⇒ Object
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/awesome_bot/output.rb', line 50
#
# Builds the printable report line for one checked link together with a
# hash describing the same issue.
#
# @param x [Hash] entry with 'status', 'loc', 'url', 'headers' and 'error'
# @param index [Integer] zero-based position in the report
# @param total [Integer] digit width for the position number
# @param largest [Integer] digit width for the line number
# @return [Array(String, Hash)] the report line and the issue hash
def output(x, index, total, largest)
  s = x['status']
  failed = s == STATUS_ERROR

  line_number = x['loc']
  url = x['url']
  shown_status = failed ? '' : s
  target = status_is_redirected?(s) ? x['headers']['location'] : ''
  problem = failed ? x['error'] : ''

  issue = {
    'loc'=> line_number,
    'status'=> s,
    'link'=> url,
    'redirect'=> target,
    'error'=> problem
  }

  position = pad_text(index + 1, total)
  text = " #{position}. #{loc_formatted(line_number, largest)} #{shown_status} #{url} " \
         "#{problem}#{output_redirect(x)} \n"

  [text, issue]
end
.output_redirect(x) ⇒ Object
78 79 80 81 82 83 84 |
# File 'lib/awesome_bot/output.rb', line 78
#
# Returns the redirect suffix for a report line, or an empty string when the
# entry's status is not a redirect.
#
# @param x [Hash] entry with 'status' and 'headers'
# @return [String] " <STATUS_REDIRECT> <location>" or ''
def output_redirect(x)
  return '' unless status_is_redirected?(x['status'])

  " #{STATUS_REDIRECT} #{x['headers']['location']}"
end
.pad_list(list) ⇒ Object
86 87 88 |
# File 'lib/awesome_bot/output.rb', line 86
#
# Returns the number of characters needed to print the element count of
# +list+.
#
# @param list [#count] collection to measure
# @return [Integer] length of the count written in decimal
def pad_list(list)
  element_count = list.count
  element_count.to_s.length
end
.pad_text(number, digits) ⇒ Object
90 91 92 93 |
# File 'lib/awesome_bot/output.rb', line 90
#
# Formats +number+ as a zero-padded decimal that is at least +digits+ wide.
#
# @param number [Integer] value to format
# @param digits [Integer] minimum width
# @return [String] padded number
def pad_text(number, digits)
  Kernel.format("%0#{digits}d", number)
end
.status_is_redirected?(status) ⇒ Boolean
48 49 50 |
# File 'lib/awesome_bot/net.rb', line 48
#
# Tells whether +status+ lies strictly between 299 and 400.
#
# @param status [Integer] status code
# @return [Boolean] true for values above 299 and below 400
def status_is_redirected?(status)
  !(status <= 299 || status >= 400)
end
.statuses(links, threads, timeout, head = false, delay = 0) ⇒ Object
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/awesome_bot/net.rb', line 52
#
# Requests every link through +net_status+ using Parallel threads and
# collects one result hash per link. A request that raises is recorded with
# STATUS_ERROR, empty headers and the rescued exception.
#
# @param links [Array<String>] URLs to check
# @param threads [Integer] thread count handed to Parallel.each
# @param timeout [Integer, nil] forwarded to +net_status+
# @param head [Boolean] forwarded to +net_status+
# @param delay [Numeric] seconds slept before each request
# @yield [status, url, headers] once per checked link
# @return [Array<Hash>] hashes with 'url', 'status', 'error' and 'headers'
def statuses(links, threads, timeout, head=false, delay=0)
  require 'parallel'

  collected = []
  Parallel.each(links, in_threads: threads) do |link|
    sleep delay

    begin
      code, headers = net_status(link, timeout, head)
      failure = nil
    rescue => e
      code = STATUS_ERROR
      headers = {}
      failure = e
    end

    yield code, link, headers if block_given?
    collected.push('url' => link, 'status' => code, 'error' => failure, 'headers' => headers)
  end # Parallel

  collected
end
.white_list(list, item) ⇒ Object
4 5 6 7 |
# File 'lib/awesome_bot/white_list.rb', line 4
#
# Tells whether +item+ contains any entry of +list+ as a substring.
#
# @param list [Array<String>] white list fragments
# @param item [String] link to test
# @return [Boolean] true when at least one fragment is included in +item+
def white_list(list, item)
  list.any? { |fragment| item.include?(fragment) }
end
.write_markdown_results(filename, filtered, silent) ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/awesome_bot/write.rb', line 12
#
# Writes a JSON file containing a Markdown table of the link issues.
# The file is named "<RESULTS_PREFIX>-<filtered filename>-markdown-table.json"
# and is created in the current directory.
#
# @param filename [String] name of the checked file, used for the output name
# @param filtered [String, nil] path of the filtered-results JSON file, or
#   nil when there are no issues (the payload is then {'error'=>false})
# @param silent [Boolean] when exactly true, nothing is written
# @return [Boolean] false when silent, true after the file was written
def write_markdown_results(filename, filtered, silent)
  return false if silent == true

  # JSON is used below; require it here so the method does not depend on load order.
  require 'json'

  payload =
    if filtered.nil?
      {'error'=>false}
    else
      results = File.read filtered
      j = JSON.parse results
      num = j.count
      plural = num == 1 ? '' : 's'
      title = "Found #{num} link issue#{plural}"

      message = "#### Link issue#{plural} by [`awesome_bot`](https://github.com/dkhamsing/awesome_bot)\n\n"
      message << " Line | Status | Link\n"
      message << "| ---: | :----: | --- |\n"
      j.sort_by { |h| h['loc'] }.each do |i|
        error = i['error']
        loc = i['loc']
        link = i['link']
        s = i['status']
        r = i['redirect']
        if error == 'Dupe'
          message << "#{loc} | Dupe | #{link} "
        else
          # -1 is the value of the STATUS_ERROR constant listed for this module.
          status = s == -1 ? 'Error' : "[#{s}](https://httpstatuses.com/#{s})"
          message << "#{loc} | #{status} | #{link} "
          message << "<br> #{error}" unless error == ''
          message << "redirects to<br>#{r}" unless r == ''
        end
        message << "\n"
      end

      {
        'error' => true,
        'title' => title,
        'message'=> message
      }
    end

  results_file_filter = filter_filename filename
  results_file = "#{RESULTS_PREFIX}-#{results_file_filter}-markdown-table.json"
  File.open(results_file, 'w') { |f| f.write JSON.pretty_generate(payload) }
  puts "Wrote markdown table results to #{results_file}"
  true
end
.write_results(f, r, silent) ⇒ Object
62 63 64 65 66 67 68 69 70 71 |
# File 'lib/awesome_bot/write.rb', line 62
#
# Asks the result object to write itself to
# "<RESULTS_PREFIX>-<filtered filename>.json".
#
# @param f [String] name of the checked file, used for the output name
# @param r [#write] result object; receives the output path
# @param silent [Boolean] when exactly true, nothing is written
# @return [Boolean] false when silent, true otherwise
def write_results(f, r, silent)
  if silent == true
    false
  else
    target = "#{RESULTS_PREFIX}-#{filter_filename(f)}.json"
    r.write target
    puts "\nWrote results to #{target}"
    true
  end
end
.write_results_filtered(file, filtered, silent) ⇒ Object
73 74 75 76 77 78 79 80 81 82 |
# File 'lib/awesome_bot/write.rb', line 73
#
# Writes +filtered+ as pretty-printed JSON to
# "<RESULTS_PREFIX>-<filtered filename>-filtered.json".
#
# @param file [String] name of the checked file, used for the output name
# @param filtered [Object] data serialised with JSON.pretty_generate
# @param silent [Boolean] when exactly true, nothing is written
# @return [String, nil] path of the written file, or nil when silent
def write_results_filtered(file, filtered, silent)
  return nil if silent == true

  target = "#{RESULTS_PREFIX}-#{filter_filename(file)}-filtered.json"
  File.write(target, JSON.pretty_generate(filtered))
  puts "Wrote filtered results to #{target}"
  target
end