Class: HarvesterTools::MetadataHarvester
- Inherits:
-
Object
- Object
- HarvesterTools::MetadataHarvester
- Defined in:
- lib/metadata_harvester.rb
Class Method Summary collapse
- .abbreviate_type(contenttype:) ⇒ Object
- .attempt_to_detect_type(body:, headers:) ⇒ Object
- .attempt_to_resolve(link:, headers: FspHarvester::ACCEPT_STAR_HEADER) ⇒ Object
- .check_json(body:) ⇒ Object
- .check_ld(body:, claimed_type:) ⇒ Object
- .extract_metadata_from_body(response:, metadata: HarvesterTools::MetadataObject.new) ⇒ Object
- .extract_metadata_from_links(links: [], metadata: HarvesterTools::MetadataObject.new) ⇒ Object
-
.ntriples_hack(body:) ⇒ Object
The distiller cannot recognize single-line N-Triples unless they end with a period, which is not required by the spec…
- .process_according_to_type(body:, uri:, abbreviation:, content_type:, metadata:, harvester: HarvesterTools::MetadataParser.new(metadata_object: @meta)) ⇒ Object
- .validate_claimed_type(abbreviation:, claimed_type:) ⇒ Object
Class Method Details
.abbreviate_type(contenttype:) ⇒ Object
229 230 231 232 233 234 235 236 237 238 239 240 241 |
# File 'lib/metadata_harvester.rb', line 229
#
# Maps a MIME type onto the short format name used by the harvester.
#
# The RDF, XML, HTML and JSON format tables are merged (in that order) and
# scanned; the key of the first entry whose MIME list contains +contenttype+
# is returned. Every entry that gets tested is logged both to stderr and to
# @meta.comments.
#
# @param contenttype [String] MIME type to look up
# @return [Object, nil] key of the matching format-table entry, or nil when
#   no table lists +contenttype+
def self.abbreviate_type(contenttype:)
  known_formats = FspHarvester::RDF_FORMATS
                  .merge(FspHarvester::XML_FORMATS)
                  .merge(FspHarvester::HTML_FORMATS)
                  .merge(FspHarvester::JSON_FORMATS)

  # find stops at the first hit, so later entries are neither tested nor logged
  hit = known_formats.find do |type, vals|
    warn "\n\ntype #{type}\nvals #{vals}\n\n"
    @meta.comments << "INFO: testing #{type} MIME types for #{contenttype}"
    vals.include? contenttype
  end
  return nil unless hit

  @meta.comments << "INFO: detected a #{hit.first} MIME type"
  hit.first
end
.attempt_to_detect_type(body:, headers:) ⇒ Object
95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# File 'lib/metadata_harvester.rb', line 95
#
# Sniffs the format of a fetched metadata document and reconciles it with the
# Content-Type the server claimed.
#
# Detection order: XML prolog (then HTML / RDF-XML / plain XML), bare HTML,
# linked data (check_ld), and finally JSON (check_json). When the claimed type
# is not valid for the sniffed format, warning '022' is recorded and a default
# MIME type for that format is used instead.
#
# @param body [String] the document body
# @param headers [Hash] response headers; only headers[:content_type] is read
#   (it may be absent) and it is not modified
# @return [Array(String, String), Array(nil, nil)] [abbreviation, content_type],
#   or [nil, nil] when no format could be recognised (warning '017' is recorded)
def self.attempt_to_detect_type(body:, headers:)
  # described by should be an html, xml, json, or linked data document
  @meta.comments << "INFO: Testing metadata format for html, xml, and linked data formats\n"

  # Strip parameters such as "; charset=utf-8". Work on a copy so the caller's
  # header value is left untouched, and tolerate a missing header.
  claimed_type = headers[:content_type].to_s.gsub(/\s*;.*/, '')

  # Shared handling for the markup formats: accept the claimed type when it is
  # valid for the sniffed format, otherwise warn and fall back to a default.
  confirm = lambda do |abbr, fallback, label|
    confirmed = validate_claimed_type(abbreviation: abbr, claimed_type: claimed_type)
    @meta.add_warning(['022', @meta.all_uris.last, '']) unless confirmed
    @meta.comments << "INFO: appears to be #{label}\n"
    [abbr, confirmed || fallback]
  end

  abbreviation, content_type =
    if body =~ /^\s*<\?xml/
      # take a sample, it should appear quite early (it will appear in other
      # places in e.g. tutorial documents)
      if body[0..1000] =~ /<HTML/i
        confirm.call('html', 'text/html', 'HTML')
      elsif body =~ /<rdf:RDF/i
        confirm.call('rdfxml', 'application/rdf+xml', 'RDF-XML')
      else
        confirm.call('xml', 'application/xml', 'XML')
      end
    elsif body[0..1000] =~ /<HTML/i
      confirm.call('html', 'text/html', 'HTML')
    else
      linked_data = check_ld(body: body, claimed_type: claimed_type)
      # don't test JSON if LD was already found! An empty abbreviation also
      # counts as "not found".
      linked_data.first.to_s.empty? ? check_json(body: body) : linked_data
    end

  # Normalise "nothing found" to nil so callers can rely on truthiness.
  abbreviation = nil if abbreviation.to_s.empty?
  content_type = nil if content_type.to_s.empty?

  unless content_type
    source_uri = @meta.all_uris.last
    @meta.add_warning(['017', source_uri, ''])
    @meta.comments << "WARN: metadata format returned from #{source_uri} (claimed content-type '#{claimed_type}') is not recognized. Processing will end now.\n"
  end
  [abbreviation, content_type]
end
.attempt_to_resolve(link:, headers: FspHarvester::ACCEPT_STAR_HEADER) ⇒ Object
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/metadata_harvester.rb', line 79
#
# Fetches the target of a harvested link with an HTTP GET.
#
# The Accept header is built from the link's own MIME type when it has one;
# otherwise the +headers+ argument is used. When the fetch yields nothing,
# warning '016' is recorded on @meta.
#
# @param link [#href] the link to resolve; may also respond to #type
# @param headers [Hash] request headers used when the link declares no type
# @return [Object, nil] whatever HarvesterTools::WebUtils.fspfetch returns
#   (falsy when the link could not be resolved)
def self.attempt_to_resolve(link:, headers: FspHarvester::ACCEPT_STAR_HEADER)
  url = link.href
  @meta.comments << "INFO: link #{url} being processed"

  link_type = link.respond_to?(:type) ? link.type : nil
  if link_type
    header = { 'Accept' => link_type }
  else
    @meta.comments << "INFO: link #{url} has no MIME type, defaulting to */*"
    # fall back to the supplied headers rather than sending no headers at all
    header = headers
  end

  response = HarvesterTools::WebUtils.fspfetch(url: url, method: :get, headers: header)
  unless response
    @meta.add_warning(['016', url, header])
    @meta.comments << "WARN: Unable to resolve describedby link #{url} using HTTP Accept header #{header}.\n"
  end
  response
end
.check_json(body:) ⇒ Object
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
# File 'lib/metadata_harvester.rb', line 211
#
# Tests whether +body+ parses as JSON.
#
# @param body [String] the document body
# @return [Array(String, String), Array(nil, nil)] ['json', 'application/json']
#   when the body parses to a truthy value, otherwise [nil, nil] (a comment is
#   then added to @meta)
def self.check_json(body:)
  # force_encoding mutates its receiver. A frozen body would raise FrozenError,
  # which the rescue below would misreport as "not JSON", so re-tag a copy in
  # that case. A mutable body is still re-tagged in place, as before.
  text = body.frozen? ? body.dup : body

  parsed =
    begin
      JSON.parse(text.force_encoding('UTF-8'))
    rescue StandardError
      nil # deliberately best-effort: any failure means "not JSON"
    end

  unless parsed
    @meta.comments << "INFO: metadata does not appear to be in JSON format. No options left.\n"
    return [nil, nil]
  end

  ['json', 'application/json']
end
.check_ld(body:, claimed_type:) ⇒ Object
159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
# File 'lib/metadata_harvester.rb', line 159
#
# Tests whether +body+ is a linked-data serialisation and decides which
# content type to process it as.
#
# Detection first tries ntriples_hack, then RDF::Format.for on the first
# 5001 characters of the body. If a format is detected:
# * a claimed application/vnd.* type is trusted as-is (vnd are domain specific);
# * a claimed type that the detected format lists is used as-is;
# * otherwise warning '022' is recorded and the detected format's first
#   content type is used instead.
#
# @param body [String] the document body
# @param claimed_type [String, nil] Content-Type claimed by the server,
#   without parameters
# @return [Array] [abbreviation, content_type]; [nil, nil] when no linked-data
#   format is detected. abbreviation may be nil when abbreviate_type does not
#   know the chosen content type.
def self.check_ld(body:, claimed_type:)
  detected_type = ntriples_hack(body: body) # ntriples hack for one-line metadata records
  unless detected_type # see if distiller can detect a type
    detected_type = RDF::Format.for({ sample: body[0..5000].force_encoding('UTF-8') })
    @meta.comments << "INFO: Auto-detected type #{detected_type}\n"
  end

  # at this point, detected_type is something like RDF::Turtle::Format (or nil)
  unless detected_type
    @meta.comments << "INFO: metadata does not appear to be in a linked data format. Trying other options.\n"
    # nil rather than '' so that callers testing truthiness fall through to
    # their next detection strategy
    return [nil, nil]
  end

  detected_content_types = detected_type.content_type # comes back as array of [application/x, application/y]

  if claimed_type =~ %r{application/vnd\.} || detected_content_types.include?(claimed_type)
    # either a domain-specific type we cannot verify, or a confirmed match
    contenttype = claimed_type
    abbreviation = abbreviate_type(contenttype: contenttype)
    @meta.comments << "INFO: using content-type #{contenttype}.\n"
  else
    # mismatch between the claimed and the detected format
    warn "detected types #{detected_content_types} claimed type #{claimed_type}"
    @meta.add_warning(['022', @meta.all_uris.last, ''])
    # just pick one arbitrarily, since it doesn't match the declared type anyway
    contenttype = detected_content_types.first
    abbreviation = abbreviate_type(contenttype: contenttype)
    @meta.comments << "INFO: using content-type #{contenttype} even though there was a mismatch.\n"
  end

  [abbreviation, contenttype]
end
.extract_metadata_from_body(response:, metadata: HarvesterTools::MetadataObject.new) ⇒ Object
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# File 'lib/metadata_harvester.rb', line 38
#
# Harvests metadata directly from the body of an HTTP response.
#
# The body's format is detected with attempt_to_detect_type and the body is
# then handed to process_according_to_type. Warning '017' is recorded when the
# format is unrecognised; warning '023' when the server returned a type the
# request did not ask for without answering 406.
#
# @param response [Object] must respond to #body, #headers, #code and #request
#   (which in turn responds to #url and #headers)
# @param metadata [HarvesterTools::MetadataObject] accumulator for results,
#   comments and warnings; also stored in @meta for the other class methods
# @return [Object, nil] the result of process_according_to_type, or nil when
#   the format is not recognised
def self.extract_metadata_from_body(response:, metadata: HarvesterTools::MetadataObject.new)
  @meta = metadata
  @meta.comments << 'INFO: now collecting both linked data and hash-style data using the harvested links'

  abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
  if abbreviation.to_s.empty?
    @meta.add_warning(['017', response.request.url, ''])
    @meta.comments << "WARN: format returned from #{response.request.url} is not recognized. Moving on.\n"
    return
  end

  # Media ranges the request asked for, with parameters such as ";q=0.9" removed.
  # A missing Accept header yields an empty list, i.e. "anything goes".
  requested_types = response.request.headers['Accept'].to_s
                            .split(/\s*,\s*/)
                            .map { |range| range.sub(/\s*;.*/, '') }
  type_wildcard = "#{content_type.to_s.split('/').first}/*"
  satisfied = requested_types.empty? ||
              requested_types.include?(content_type) ||
              requested_types.include?('*/*') ||
              requested_types.include?(type_wildcard)

  if !satisfied && response.code != 406
    @meta.add_warning(['023', response.request.url, ''])
    @meta.comments << "WARN: format returned from #{response.request.url} does not match request type. This should result in a 406 error, but instead was accepted as a 200.\n"
  end

  process_according_to_type(body: response.body, uri: response.request.url, metadata: @meta,
                            abbreviation: abbreviation, content_type: content_type)
end
.extract_metadata_from_links(links: [], metadata: HarvesterTools::MetadataObject.new) ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/metadata_harvester.rb', line 8
#
# Harvests metadata from every 'describedby' link in +links+.
#
# Each such link is resolved (attempt_to_resolve), its format detected
# (attempt_to_detect_type) and its body processed (process_according_to_type).
# Links that cannot be resolved or whose format is unrecognised are skipped;
# the latter records warning '017'.
#
# @param links [Array] link objects responding to #relation and #href, and
#   optionally #type
# @param metadata [HarvesterTools::MetadataObject] accumulator for results,
#   comments and warnings; also stored in @meta for the other class methods
# @return [Array] the 'describedby' links that were considered
def self.extract_metadata_from_links(links: [], metadata: HarvesterTools::MetadataObject.new)
  @meta = metadata
  @meta.comments << 'INFO: now collecting both linked data and hash-style data using the harvested links'

  describedby = links.select { |l| l.relation == 'describedby' }
  warn "metadata harvester links length #{describedby.length}"

  # put here because the class variable for detecting duplicates should apply to all URIs
  hvst = HarvesterTools::MetadataParser.new(metadata_object: @meta)

  describedby.each do |link|
    accept = link.respond_to?(:type) ? link.type : nil
    # patch for bug in Dataverse 5.14 linksets. Done in place on purpose so the
    # link itself carries the corrected type; skipped when the link has no type.
    accept.gsub!('json+ld', 'ld+json') if accept
    accepttype = accept ? { 'Accept' => accept } : FspHarvester::ACCEPT_STAR_HEADER

    response = attempt_to_resolve(link: link, headers: accepttype)
    warn "\n\nRESPONSE #{response}\n\n"
    next unless response # nothing was fetched, so there is no body to inspect

    abbreviation, content_type = attempt_to_detect_type(body: response.body, headers: response.headers)
    warn "ABBR #{abbreviation} CONT #{content_type}\n\n"
    if abbreviation.to_s.empty?
      @meta.add_warning(['017', link.href, accepttype])
      @meta.comments << "WARN: metadata format returned from #{link.href} using Accept header #{accepttype} is not recognized. Processing will end now.\n"
      next
    end

    # NOTE(review): the link object itself (not link.href) is passed as uri,
    # as before -- confirm that is what the parser expects.
    process_according_to_type(body: response.body, uri: link, metadata: @meta,
                              abbreviation: abbreviation, content_type: content_type, harvester: hvst)
  end
end
.ntriples_hack(body:) ⇒ Object
The distiller cannot recognize single-line N-Triples unless they end with a period, which is not required by the spec… so hack it!
193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
# File 'lib/metadata_harvester.rb', line 193
#
# distiller cannot recognize single-line ntriples unless they end with a
# period, which is not required by the spec... so hack it!
#
# Finds the first non-blank line of +body+ that looks like
# "<subject> <predicate> object", appends " ." to it and asks RDF::Format
# to identify the result. Only that first candidate line is examined.
#
# @param body [String] the document body
# @return [Class, nil] RDF::NTriples::Format when the patched line is detected
#   as N-Triples, otherwise nil (only the hacky case is ever returned)
def self.ntriples_hack(body:)
  detected_type = nil

  # Iterate over lines, not whitespace-separated tokens: the pattern below
  # needs the whitespace inside a triple, which a bare String#split removes.
  body.each_line do |raw_line|
    line = raw_line.strip
    next if line.empty?
    next unless line =~ /\s*<[^>]+>\s*<[^>]+>\s\S+/

    @meta.comments << "INFO: running ntriples hack on #{line + ' .'}\n"
    detected_type = RDF::Format.for({ sample: "#{line} ." }) # adding a period allows detection of ntriples by distiller
    break
  end

  @meta.comments << "INFO: ntriples hack found: #{detected_type}\n"
  return nil if detected_type != RDF::NTriples::Format # only return the hacky case

  detected_type
end
.process_according_to_type(body:, uri:, abbreviation:, content_type:, metadata:, harvester: HarvesterTools::MetadataParser.new(metadata_object: @meta)) ⇒ Object
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/metadata_harvester.rb', line 57
#
# Dispatches a document body to the parser method matching its format.
#
# @param body [String] the document body
# @param uri [Object] identifier of the document; only forwarded to process_html
# @param abbreviation [String] short format name: 'html', 'xml', 'json',
#   'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads' or 'specialist'
# @param content_type [String] MIME type; only forwarded to process_ld
# @param metadata [Object] accumulator that receives comments and is forwarded
#   to the parser; @meta is used when nil is passed
# @param harvester [#process_html, #process_xml, #process_json, #process_ld]
#   parser to dispatch to
# @return [Object, nil] whatever the selected parser method returns; nil for
#   'specialist' and for unknown abbreviations
def self.process_according_to_type(body:, uri:, abbreviation:, content_type:, metadata:, harvester: HarvesterTools::MetadataParser.new(metadata_object: @meta))
  # Honour the metadata object the caller passed instead of silently relying
  # on shared class state.
  meta = metadata || @meta

  warn "PROCESSING #{abbreviation}"
  case abbreviation
  when 'html'
    meta.comments << 'INFO: Processing html'
    harvester.process_html(body: body, uri: uri, metadata: meta)
  when 'xml'
    meta.comments << 'INFO: Processing xml'
    harvester.process_xml(body: body, metadata: meta)
  when 'json'
    meta.comments << 'INFO: Processing json'
    harvester.process_json(body: body, metadata: meta)
  when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
    warn 'PROCESSING USING TURTLE'
    meta.comments << 'INFO: Processing linked data'
    harvester.process_ld(body: body, content_type: content_type, metadata: meta)
  when 'specialist'
    warn 'no specialized parsers so far'
  else
    # make the otherwise silent no-op visible
    warn "no parser available for #{abbreviation.inspect}; nothing processed"
  end
end
.validate_claimed_type(abbreviation:, claimed_type:) ⇒ Object
140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
# File 'lib/metadata_harvester.rb', line 140
#
# Checks that the Content-Type a server claimed is valid for the format that
# was actually detected.
#
# @param abbreviation [String] short name of the detected format ('html',
#   'xml', 'json', 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads' or
#   'specialist')
# @param claimed_type [String, nil] claimed MIME type; parameters such as
#   "; charset=utf-8" are ignored and the argument is not modified
# @return [String, false] the claimed MIME type without parameters when it is
#   listed for the detected format, otherwise false
def self.validate_claimed_type(abbreviation:, claimed_type:)
  warn "\n\nclaimed type #{claimed_type}\nabbreviation #{abbreviation}\n\n"

  # Work on a copy: the caller's string may be frozen or reused, and nil
  # (no Content-Type at all) simply becomes an empty, never-matching type.
  bare_type = claimed_type.to_s.gsub(/\s*;.*/, '')

  accepted =
    case abbreviation
    when 'html' then FspHarvester::HTML_FORMATS['html']
    when 'xml' then FspHarvester::XML_FORMATS['xml']
    when 'json' then FspHarvester::JSON_FORMATS['json']
    when 'jsonld', 'rdfxml', 'turtle', 'ntriples', 'nquads'
      FspHarvester::RDF_FORMATS.values.flatten
    when 'specialist'
      warn 'no specialized parsers so far'
      nil
    end

  # Array() guards against a format table that lacks the expected key
  Array(accepted).include?(bare_type) ? bare_type : false
end