Class: HarvesterTools::BruteForce
- Inherits: Object
- Inheritance hierarchy: Object → HarvesterTools::BruteForce
- Defined in:
- lib/harvester_brute.rb
Class Method Summary
- .begin_brute_force(guid:, links: [], metadata: HarvesterTools::MetadataObject.new) ⇒ Object
- .do_content_negotiation(url:, metadata:, links: []) ⇒ Object
- .process_alternates(links: [], metadata:) ⇒ Object
- .resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:) ⇒ Object
- .sanity_check_alternate(link:, metadata:) ⇒ Object
Class Method Details
.begin_brute_force(guid:, links: [], metadata: HarvesterTools::MetadataObject.new) ⇒ Object
6 7 8 9 10 11 12 13 |
# File 'lib/harvester_brute.rb', line 6

# Entry point of the brute-force harvest: turns a GUID into a URL and hands
# that URL to content negotiation.
#
# @param guid [Object] identifier passed to HarvesterTools::Utils.convertToURL
# @param links [Array] link objects forwarded unchanged to do_content_negotiation
# @param metadata [Object] accumulator forwarded unchanged to do_content_negotiation
#   (a new HarvesterTools::MetadataObject when omitted)
# @return [false, Object] false when convertToURL yields no type; otherwise
#   whatever do_content_negotiation returns
def self.begin_brute_force(guid:, links: [], metadata: HarvesterTools::MetadataObject.new)
  type, url = HarvesterTools::Utils.convertToURL(guid: guid)
  return false unless type

  # TODO: follow rel=alternate headers, if they are in LD or Hash format
  do_content_negotiation(url: url, metadata: metadata, links: links)
end
.do_content_negotiation(url:, metadata:, links: []) ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/harvester_brute.rb', line 15

# Fetches +url+ with several Accept headers, passes every successful response
# to the metadata harvester, then processes the supplied alternate links.
#
# Sequence:
# 1. fetch +url+ with FspHarvester::ACCEPT_LD_HEADER;
# 2. fetch +url+ with FspHarvester::ACCEPT_STAR_HEADER;
# 3. if step 2 succeeded, fetch the URL that response actually came from
#    (response.request.url) with ACCEPT_LD_HEADER again;
# 4. call process_alternates with +links+.
#
# NOTE(review): the name of the MetadataHarvester method called below was lost
# in the rendering this listing was recovered from; `extract_metadata_from_body`
# is presumed - confirm against lib/harvester_brute.rb.
#
# @param url [String] address to negotiate on
# @param metadata [Object] accumulator; must expose +comments+ (responds to <<)
# @param links [Array] link objects handed to process_alternates
# @return [Object] whatever process_alternates returns
def self.do_content_negotiation(url:, metadata:, links: [])
  warn "\n\nINFO: entering content negotiation of #{url}\n\n"
  metadata.comments << "INFO: entering content negotiation of #{url}.\n"

  response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER)
  if response
    HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
  end

  response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_STAR_HEADER)
  if response
    # extract from landing page
    HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
    # now do content negotiation on the landing page
    response = resolve_url_brute(url: response.request.url, metadata: metadata,
                                 headers: FspHarvester::ACCEPT_LD_HEADER)
    if response
      # extract from landing page
      HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
    end
  end

  process_alternates(links: links, metadata: metadata)
end
.process_alternates(links: [], metadata:) ⇒ Object
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# File 'lib/harvester_brute.rb', line 36

# Fetches every link whose relation is "alternate" and that passes
# sanity_check_alternate, and passes each successful response to the
# metadata harvester.
#
# The Accept header for a link is its declared +type+; when the link has no
# +type+ method, or the type is nil/empty, FspHarvester::ACCEPT_STAR_HEADER is
# used instead (previously a nil type produced an empty Accept header).
#
# NOTE(review): the name of the MetadataHarvester method called below was lost
# in the rendering this listing was recovered from; `extract_metadata_from_body`
# is presumed - confirm against lib/harvester_brute.rb.
#
# @param links [Array] objects responding to +relation+ and +href+, optionally +type+
# @param metadata [Object] accumulator; must expose +comments+ (responds to <<)
# @return [Array] +links+ (the receiver of +each+)
def self.process_alternates(links: [], metadata:)
  warn "\n\nINFO: entering content negotiation on link alternates\n\n"
  metadata.comments << "INFO: entering content negotiation on link alternates.\n"

  # process "alternate" links
  links.each do |link|
    next unless link.relation == 'alternate'
    next unless sanity_check_alternate(link: link, metadata: metadata) # don't try to process zip files! LOL!

    url = link.href
    type = link.type if link.respond_to?('type')
    headers = type.to_s.empty? ? FspHarvester::ACCEPT_STAR_HEADER : { 'Accept' => type.to_s }

    warn "\n\nINFO: resolving alternate #{url} with headers #{headers}\n\n"
    # record the same event that was just logged, not the section banner again
    metadata.comments << "INFO: resolving alternate #{url} with headers #{headers}.\n"

    # now do content negotiation on the link
    response = resolve_url_brute(url: url, metadata: metadata, headers: headers)
    next unless response

    # extract from alternate link
    HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
  end
end
.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:) ⇒ Object
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# File 'lib/harvester_brute.rb', line 80

# Fetches +url+ with the given request headers, at most once per
# (url, headers) combination, recording the outcome on +metadata+.
#
# A combination counts as processed only after a successful fetch: the MD5 of
# url + headers.to_s is stored in metadata.url_header_hash at that point, and
# later calls with the same combination return false without fetching.
#
# On a failed fetch the method records warning '001', a WARN comment and a
# "No response" entry, then returns false. (Previously the bare +false+ in
# that branch was not returned, so execution continued into +response.body+
# with a nil response.)
#
# @param url [String] address to fetch
# @param method [Symbol] HTTP method forwarded to HarvesterTools::WebUtils.fspfetch
# @param nolinkheaders [Boolean] accepted for interface compatibility; not read
#   in this method body
# @param headers [Hash] request headers forwarded to fspfetch
# @param metadata [Object] accumulator; must expose url_header_hash, comments,
#   guidtype/guidtype=, add_warning, full_response and all_uris
# @return [false, Object] false when already processed or when the fetch
#   yields nothing; otherwise the response returned by fspfetch
def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
  cache_key = Digest::MD5.hexdigest(url + headers.to_s)
  if metadata.url_header_hash[cache_key]
    warn "Already processed #{url} - moving on"
    metadata.comments << "INFO: Already processed #{url} - moving on.\n"
    return false
  end

  metadata.guidtype = 'uri' if metadata.guidtype.nil?
  warn "\n\n BRUTE FETCHING #{url} \nwith headers\n #{headers}\n\n"
  response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: metadata)

  unless response
    metadata.add_warning(['001', url, headers])
    metadata.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
    metadata.full_response << [url, 'No response']
    return false # must leave here: there is no response to read a body from
  end

  warn "\n\n head #{response.headers.inspect}\n\n"
  metadata.comments << "INFO: following redirection using this header led to the following URL: #{metadata.all_uris.last}. Using the output from this URL for the next few tests..."
  metadata.full_response << [url, response.body]
  metadata.url_header_hash[cache_key] = true
  response
end
.sanity_check_alternate(link:, metadata:) ⇒ Object
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
# File 'lib/harvester_brute.rb', line 57

# Decides whether an "alternate" link is worth fetching as metadata.
#
# * Link without a type: the extension taken from +href+ must be one of a fixed
#   list of trusted extensions.
# * Link with a type: the type, with any ";..." parameter suffix removed, must
#   be recognised by HarvesterTools::MetadataHarvester.abbreviate_type.
#
# The extension regex now has a capture group and a missing match is
# tolerated. Before, +m[1]+ was always nil (no group), so every untyped link
# was rejected, and an href without a dot raised on nil.
#
# @param link [Object] responds to +href+ and optionally +type+
# @param metadata [Object] accumulator; must expose +comments+ (responds to <<)
# @return [Boolean] true when the link may be processed
def self.sanity_check_alternate(link:, metadata:)
  type = link.type if link.respond_to?('type')
  href = link.href

  unless type # we're gonna have to check extensions...
    # greedy ".*" makes the capture the text after the last dot that is
    # followed by word characters
    m = href.to_s.match(/.*\.([\w\-]+)/)
    extension = m && m[1]
    unless %w[json jsonld rdf ttl turtle n3 triples ntriples txt html xhtml nq xml].include?(extension)
      warn "\n\nINFO: extension #{extension} is not trusted as a link 'alternate' representation for metadata, so it will not be processed\n"
      metadata.comments << "INFO: extension #{extension} is not trusted as a link 'alternate' representation for metadata, so it will not be processed.\n"
      return false
    end
    return true
  end

  # remove any UTF8 blah blah; gsub! edits the string object obtained from
  # link.type in place
  type.gsub!(/;.*/, '')
  abbrev = HarvesterTools::MetadataHarvester.abbreviate_type(contenttype: type)
  unless abbrev
    warn "\n\nINFO: content-type #{type} is not trusted as a link 'alternate' representation for metadata, so it will not be processed\n"
    metadata.comments << "INFO: content-type #{type} is not trusted as a link 'alternate' representation for metadata, so it will not be processed.\n"
    return false
  end
  true
end