Class: SiteDiff::Crawler
- Inherits: Object
- Class hierarchy: Object → SiteDiff::Crawler
- Defined in:
- lib/sitediff/crawler.rb
Overview
SiteDiff Crawler.
Defined Under Namespace
Classes: Info
Constant Summary collapse
- DEFAULT_DEPTH =
3
Instance Method Summary collapse
-
#add_uri(rel, depth, referrer = '') ⇒ Object
Handle a newly found relative URI.
-
#fetched_uri(rel, depth, res) ⇒ Object
Handle the fetch of a URI.
-
#filter_links(uris) ⇒ Object
Filter out links we don’t want.
-
#find_links(doc) ⇒ Object
Return a list of string links found on a page.
-
#initialize(hydra, base, interval, include_regex, exclude_regex, depth = DEFAULT_DEPTH, curl_opts = UriWrapper::DEFAULT_CURL_OPTS, debug: true, &block) ⇒ Crawler
constructor
Create a crawler with a base URL.
-
#relativize_link(uri) ⇒ Object
Make a link relative to @base_uri.
-
#resolve_link(base, rel) ⇒ Object
Resolve a potentially-relative link.
Constructor Details
#initialize(hydra, base, interval, include_regex, exclude_regex, depth = DEFAULT_DEPTH, curl_opts = UriWrapper::DEFAULT_CURL_OPTS, debug: true, &block) ⇒ Crawler
Create a crawler with a base URL
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
# File 'lib/sitediff/crawler.rb', line 18

# Create a crawler with a base URL.
#
# @param hydra [Object] request queue the crawler schedules fetches on
#   (presumably a Typhoeus::Hydra — confirm against callers)
# @param base [String] base URL; all crawled URIs are relative to this
# @param interval [Numeric] delay between fetches, in milliseconds (0 = none)
# @param include_regex [Regexp, nil] paths matching this are always kept
# @param exclude_regex [Regexp, nil] paths matching this are dropped
#   (unless also included)
# @param depth [Integer] how many link levels to follow from the base
# @param curl_opts [Hash] options forwarded to UriWrapper
# @param debug [Boolean] forwarded to UriWrapper
# @yield [Info] called once per successfully fetched page
def initialize(hydra, base, interval, include_regex, exclude_regex,
               depth = DEFAULT_DEPTH,
               curl_opts = UriWrapper::DEFAULT_CURL_OPTS,
               debug: true,
               &block)
  @hydra = hydra
  @base_uri = Addressable::URI.parse(base)
  @base = base
  @interval = interval
  @include_regex = include_regex
  @exclude_regex = exclude_regex
  @found = Set.new
  @callback = block
  @curl_opts = curl_opts
  @debug = debug

  # BUG FIX: add_uri takes `referrer` as a *positional* optional parameter,
  # so the previous keyword-style call `add_uri('', depth, referrer: '/')`
  # passed the hash `{ referrer: '/' }` as the referrer (Ruby converts the
  # trailing keywords into a positional Hash for methods without keyword
  # params). Pass the string positionally, matching the recursive call in
  # #fetched_uri.
  add_uri('', depth, '/')
end
Instance Method Details
#add_uri(rel, depth, referrer = '') ⇒ Object
Handle a newly found relative URI
41 42 43 44 45 46 47 48 49 50 |
# File 'lib/sitediff/crawler.rb', line 41

# Handle a newly found relative URI: queue a fetch for it unless it has
# already been seen.
#
# @param rel [String] URI relative to @base
# @param depth [Integer] remaining crawl depth for links found on this page
# @param referrer [String] URI of the page this link was found on
def add_uri(rel, depth, referrer = '')
  # Set#add? returns nil when rel was already present, so this both
  # deduplicates and records the URI in a single step.
  return unless @found.add?(rel)

  UriWrapper
    .new(@base + rel, @curl_opts, debug: @debug, referrer:)
    .queue(@hydra) { |response| fetched_uri(rel, depth, response) }
end
#fetched_uri(rel, depth, res) ⇒ Object
Handle the fetch of a URI
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/sitediff/crawler.rb', line 53

# Handle the fetch of a URI: report it via the callback, then (while depth
# remains) discover, filter and queue the links it contains.
#
# @param rel [String] URI relative to @base that was fetched
# @param depth [Integer] remaining crawl depth for links on this page
# @param res [Object] fetch result exposing #error and #content
#   (presumably a UriWrapper read result — confirm)
def fetched_uri(rel, depth, res)
  # Abort on transport errors or a missing body.
  if res.error
    SiteDiff.log(res.error, :error)
    return
  end
  unless res.content
    SiteDiff.log('Response is missing content. Treating as an error.', :error)
    return
  end

  page_uri = Addressable::URI.parse(@base + rel)
  page_doc = Nokogiri::HTML(res.content)

  info = Info.new(
    relative: rel,
    uri: page_uri,
    read_result: res,
    document: page_doc
  )

  # Throttle: pause between fetches when an interval is configured.
  unless @interval == 0
    SiteDiff.log("Waiting #{@interval} milliseconds.", :info)
    sleep(@interval / 1000.0)
  end

  @callback[info]

  # Stop recursing once the depth budget is spent.
  return if depth < 1

  # Discover links, resolve them against this page, keep only the ones we
  # crawl, and convert back to base-relative form.
  child_rels = filter_links(
    find_links(page_doc).filter_map { |link| resolve_link(page_uri, link) }
  ).map { |uri| relativize_link(uri) }

  # Queue each unseen link one level deeper, with this page as referrer.
  child_rels.each do |child|
    add_uri(child, depth - 1, rel) unless @found.include?(child)
  end
end
#filter_links(uris) ⇒ Object
Filter out links we don’t want. Links passed in are absolute URIs.
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
# File 'lib/sitediff/crawler.rb', line 127

# Filter out links we don't want. Links passed in are absolute URIs.
#
# Keeps only URIs on the same host and under the base path; among those,
# an explicit include match always wins, otherwise anything not excluded
# is kept.
#
# @param uris [Array] absolute URIs (must respond to #host and #path)
# @return [Array] the URIs to crawl
def filter_links(uris)
  uris.select do |uri|
    # Same host AND under the base path, otherwise skip outright.
    same_site = uri.host == @base_uri.host &&
                uri.path.start_with?(@base_uri.path)
    next false unless same_site

    included = @include_regex && @include_regex.match(uri.path)
    excluded = @exclude_regex && @exclude_regex.match(uri.path)

    # Log URLs dropped purely because of the exclude pattern.
    SiteDiff.log "Ignoring excluded URL #{uri.path}", :info if excluded && !included

    included || !excluded
  end
end
#find_links(doc) ⇒ Object
Return a list of string links found on a page.
122 123 124 |
# File 'lib/sitediff/crawler.rb', line 122

# Return a list of string links found on a page.
#
# @param doc [Nokogiri::HTML::Document] parsed page
# @return [Array<String>] the href of every anchor that has one
def find_links(doc)
  # Only anchors carrying an href attribute are considered.
  doc.xpath('//a[@href]').map { |anchor| anchor['href'] }
end
#relativize_link(uri) ⇒ Object
Make a link relative to @base_uri
107 108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/sitediff/crawler.rb', line 107

# Make a link relative to @base_uri.
#
# Strips the base path prefix from the URI's path. Query strings and
# fragments are intentionally not preserved.
#
# @param uri [Addressable::URI] absolute URI under @base_uri
# @return [String, nil] the path with the base prefix removed
def relativize_link(uri)
  uri.path[@base_uri.path.length..]
end
#resolve_link(base, rel) ⇒ Object
Resolve a potentially-relative link. Return nil on error.
98 99 100 101 102 103 104 |
# File 'lib/sitediff/crawler.rb', line 98

# Resolve a potentially-relative link. Return nil on error.
#
# @param base [Addressable::URI] URI of the page the link appeared on
# @param rel [String] raw href value, possibly padded with whitespace
# @return [Addressable::URI, nil] the joined URI, or nil if it is invalid
def resolve_link(base, rel)
  # Hrefs scraped from HTML often carry stray whitespace.
  rel = rel.strip
  begin
    base + rel
  rescue Addressable::URI::InvalidURIError
    SiteDiff.log "skipped invalid URL: '#{rel}' (at #{base})", :warning
    nil
  end
end