Class: HarvesterTools::BruteForce

Inherits:
Object
  • Object
show all
Defined in:
lib/harvester_brute.rb

Class Method Summary collapse

Class Method Details

.begin_brute_force(guid:, links: [], metadata: HarvesterTools::MetadataObject.new) ⇒ Object



6
7
8
9
10
11
12
13
# File 'lib/harvester_brute.rb', line 6

# Entry point for a brute-force harvest of a GUID.
#
# Converts the GUID into a resolvable URL, then runs content negotiation
# against that URL (and any supplied links), accumulating results in +metadata+.
#
# NOTE(review): in this listing every bare `metadata` reference was missing
# (`metadata: ,` in the call and a whitespace-only final line). Both are
# restored here. Returning +metadata+ is inferred from that blank line and from
# the default-constructed argument, which a caller could not otherwise reach —
# confirm against lib/harvester_brute.rb.
#
# @param guid [Object] identifier handed to HarvesterTools::Utils.convertToURL
# @param links [Array] link objects forwarded to do_content_negotiation
# @param metadata [HarvesterTools::MetadataObject] accumulator for harvest results
# @return [HarvesterTools::MetadataObject, false] the accumulator, or false
#   when convertToURL reports no type for the GUID
def self.begin_brute_force(guid:, links: [], metadata: HarvesterTools::MetadataObject.new)
  type, url = HarvesterTools::Utils.convertToURL(guid: guid)
  return false unless type

  # TODO: follow rel=alternate headers, if they are in LD or Hash format
  do_content_negotiation(url: url, metadata: metadata, links: links)
  metadata
end

.do_content_negotiation(url:, metadata:, links: []) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/harvester_brute.rb', line 15

# Runs content negotiation against +url+ and then against the link alternates.
#
# Three fetches are attempted through resolve_url_brute:
# 1. +url+ with the linked-data Accept header,
# 2. +url+ with the star Accept header (the landing page),
# 3. the landing page's final request URL, again with the linked-data header.
# Each successful response is handed to the metadata harvester.
#
# NOTE(review): this listing had lost every bare `metadata` reference and the
# method name after `MetadataHarvester.`. `metadata` is restored from the
# parameter name. `extract_metadata_from_body` is an assumption — confirm the
# name against HarvesterTools::MetadataHarvester.
#
# @param url [String] URL to negotiate against
# @param metadata [Object] accumulator; must expose +comments+ (responds to <<)
# @param links [Array] link objects passed on to process_alternates
# @return [Object] whatever process_alternates returns
def self.do_content_negotiation(url:, metadata:, links: [])
  warn "\n\nINFO: entering content negotiation of #{url}\n\n"
  metadata.comments << "INFO: entering content negotiation of #{url}.\n"

  response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_LD_HEADER)
  HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) if response

  response = resolve_url_brute(url: url, metadata: metadata, headers: FspHarvester::ACCEPT_STAR_HEADER)
  if response
    # extract from landing page
    HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata)
    # now do content negotiation on the landing page itself (the URL reached after redirects)
    response = resolve_url_brute(url: response.request.url, metadata: metadata,
                                 headers: FspHarvester::ACCEPT_LD_HEADER)
    HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) if response
  end

  process_alternates(links: links, metadata: metadata)
end

.process_alternates(links: [], metadata:) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/harvester_brute.rb', line 36

# Performs content negotiation on every link whose relation is "alternate".
#
# Links rejected by sanity_check_alternate are skipped. A link that declares a
# non-empty type is requested with that type as the Accept header; otherwise
# the star Accept header is used (previously a nil type produced an empty
# Accept header). The per-link comment now describes the link being resolved
# rather than repeating the "entering..." message.
#
# NOTE(review): this listing had lost every bare `metadata` reference and the
# method name after `MetadataHarvester.`. `extract_metadata_from_body` is an
# assumption — confirm the name against HarvesterTools::MetadataHarvester.
#
# @param links [Array] objects responding to +relation+ and +href+ (optionally +type+)
# @param metadata [Object] accumulator; must expose +comments+ (responds to <<)
# @return [Array] the +links+ argument (the result of +each+)
def self.process_alternates(links: [], metadata:)
  warn "\n\nINFO: entering content negotiation on link alternates\n\n"
  metadata.comments << "INFO: entering content negotiation on link alternates.\n"

  links.each do |link|
    next unless link.relation == 'alternate'
    # don't try to process zip files and other untrusted representations
    next unless sanity_check_alternate(link: link, metadata: metadata)

    url = link.href
    declared_type = link.respond_to?(:type) ? link.type.to_s : ''
    headers = declared_type.empty? ? FspHarvester::ACCEPT_STAR_HEADER : { 'Accept' => declared_type }

    warn "\n\nINFO: resolving alternate #{url} with headers #{headers}\n\n"
    metadata.comments << "INFO: resolving alternate #{url} with headers #{headers}.\n"

    # now do content negotiation on the link
    response = resolve_url_brute(url: url, metadata: metadata, headers: headers)
    # extract from alternate link
    HarvesterTools::MetadataHarvester.extract_metadata_from_body(response: response, metadata: metadata) if response
  end
end

.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:) ⇒ Object



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/harvester_brute.rb', line 80

# Fetches +url+ with the given Accept +headers+, at most once per URL/header pair.
#
# The URL/header combination is remembered in +metadata.url_header_hash+ (keyed
# by an MD5 digest) so the same request is not repeated. Successful and failed
# fetches are both recorded in +metadata.full_response+.
#
# Fixes relative to the listing:
# * every bare `metadata` reference had been dropped and is restored
#   (NOTE(review): confirm against lib/harvester_brute.rb);
# * the failure branch now really returns false — previously the bare `false`
#   was discarded and execution fell through to +response.body+.
#
# @param url [String] URL to fetch
# @param method [Symbol] HTTP method forwarded to HarvesterTools::WebUtils.fspfetch
# @param nolinkheaders [Boolean] accepted for interface compatibility; not used in this method
# @param headers [Hash] request headers, also part of the cache key
# @param metadata [Object] accumulator exposing +url_header_hash+, +comments+,
#   +guidtype+, +add_warning+, +full_response+ and +all_uris+
# @return [Object, false] the response from fspfetch, or false when the
#   combination was already processed or the fetch produced no response
def self.resolve_url_brute(url:, method: :get, nolinkheaders: true, headers:, metadata:)
  cache_key = Digest::MD5.hexdigest(url + headers.to_s)
  if metadata.url_header_hash[cache_key]
    warn "Already processed #{url} - moving on"
    metadata.comments << "INFO: Already processed #{url} - moving on.\n"
    return false
  end

  metadata.guidtype = 'uri' if metadata.guidtype.nil?
  warn "\n\n BRUTE FETCHING #{url} \nwith headers\n #{headers}\n\n"
  response = HarvesterTools::WebUtils.fspfetch(url: url, headers: headers, method: method, meta: metadata)

  unless response
    metadata.add_warning(['001', url, headers])
    metadata.comments << "WARN: Unable to resolve #{url} using HTTP Accept header #{headers}.\n"
    metadata.full_response << [url, 'No response']
    return false
  end

  warn "\n\n head #{response.headers.inspect}\n\n"
  metadata.comments << "INFO: following redirection using this header led to the following URL: #{metadata.all_uris.last}.  Using the output from this URL for the next few tests..."
  metadata.full_response << [url, response.body]
  metadata.url_header_hash[cache_key] = true
  response
end

.sanity_check_alternate(link:, metadata:) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/harvester_brute.rb', line 57

# Decides whether an "alternate" link is worth fetching as a metadata source.
#
# When the link declares a type, any parameters after ";" are stripped and the
# link is trusted only if HarvesterTools::MetadataHarvester.abbreviate_type
# returns a truthy value for it. When no type is declared, the file extension
# of the href must be on a short allow-list.
#
# Fixes relative to the listing:
# * the extension regex now captures the extension (it had no capture group,
#   so the extension was always nil and every untyped link was rejected);
# * an href without any "." no longer raises — it is simply rejected;
# * the link's own type string is no longer mutated in place;
# * the dropped `metadata` receivers are restored
#   (NOTE(review): confirm against lib/harvester_brute.rb).
#
# @param link [Object] responds to +href+ and optionally +type+
# @param metadata [Object] accumulator; must expose +comments+ (responds to <<)
# @return [Boolean] true when the link may be processed
def self.sanity_check_alternate(link:, metadata:)
  type = link.type if link.respond_to?(:type)

  unless type # we're gonna have to check extensions...
    # greedy match: capture the word characters after the last "."
    extension = link.href.to_s[/.*\.([\w-]+)/, 1]
    return true if %w[json jsonld rdf ttl turtle n3 triples ntriples txt html xhtml nq xml].include?(extension)

    warn "\n\nINFO: extension #{extension} is not trusted as a link 'alternate' representation for metadata, so it will not be processed\n"
    metadata.comments << "INFO: extension #{extension} is not trusted as a link 'alternate' representation for metadata, so it will not be processed.\n"
    return false
  end

  type = type.gsub(/;.*/, '') # remove any charset/parameter suffix without touching link.type
  return true if HarvesterTools::MetadataHarvester.abbreviate_type(contenttype: type)

  warn "\n\nINFO: content-type #{type} is not trusted as a link 'alternate' representation for metadata, so it will not be processed\n"
  metadata.comments << "INFO: content-type #{type} is not trusted as a link 'alternate' representation for metadata, so it will not be processed.\n"
  false
end