Class: HarvesterTools::ExternalTools
- Inherits: Object
- Inheritance chain: Object → HarvesterTools::ExternalTools
- Defined in:
- lib/external_tools.rb
Instance Attribute Summary collapse
-
#distillerknown ⇒ Object
Returns the value of attribute distillerknown.
-
#extructknown ⇒ Object
Returns the value of attribute extructknown.
Instance Method Summary collapse
-
#initialize(metadata: HarvesterTools::MetadataObject.new) ⇒ ExternalTools
constructor
A new instance of ExternalTools.
- #process_with_distiller(body:, metadata:) ⇒ Object
- #process_with_extruct(uri:, metadata:) ⇒ Object
Constructor Details
#initialize(metadata: HarvesterTools::MetadataObject.new) ⇒ ExternalTools
Returns a new instance of ExternalTools.
10 11 12 13 14 |
# File 'lib/external_tools.rb', line 10
#
# Builds an ExternalTools instance with empty distiller and extruct caches.
#
# @param metadata [Object] object stored in @meta; a fresh
#   HarvesterTools::MetadataObject is created when none is given
# @return [ExternalTools] a new instance of ExternalTools
def initialize(metadata: HarvesterTools::MetadataObject.new)
  @distillerknown = {} # filled by #process_with_distiller, keyed by SHA256 of the body
  @extructknown = {}   # filled by #process_with_extruct, keyed by SHA256 of the URI
  # NOTE(review): the published listing shows this assignment with an empty
  # right-hand side; `metadata` is the only value in scope that fits - confirm
  # against lib/external_tools.rb.
  @meta = metadata
end
Instance Attribute Details
#distillerknown ⇒ Object
Returns the value of attribute distillerknown.
8 9 10 |
# File 'lib/external_tools.rb', line 8
#
# Reader for the distiller cache.
#
# @return [Object] the current value of @distillerknown
def distillerknown
  @distillerknown
end
#extructknown ⇒ Object
Returns the value of attribute extructknown.
8 9 10 |
# File 'lib/external_tools.rb', line 8
#
# Reader for the extruct cache.
#
# @return [Object] the current value of @extructknown
def extructknown
  @extructknown
end
Instance Method Details
#process_with_distiller(body:, metadata:) ⇒ Object
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# File 'lib/external_tools.rb', line 16
#
# Runs the external RDF distiller command over a message body and returns
# whatever it prints on stdout.
#
# Bodies are remembered in #distillerknown by their SHA256 digest, so a body
# that was already distilled by this instance is not processed again.
#
# @param body [String] message body to examine; the caller's string is left
#   untouched (a UTF-8, scrubbed copy is what gets written to disk)
# @param metadata [#comments, #add_warning] receives INFO/WARN comment lines
#   and warning code '018' when no "@context" is found in the output
# @return [String, nil] the distiller output when it mentions "@context",
#   '{}' when it does not, and nil when this body was already processed
def process_with_distiller(body:, metadata:)
  # NOTE(review): the published listing only preserved a bare `=` here; restored
  # as `@meta = metadata` to mirror #initialize - confirm against the real file.
  @meta = metadata
  bhash = Digest::SHA256.hexdigest(body)
  if distillerknown[bhash]
    metadata.comments << "INFO: data is already parsed by distiller.\n"
    return nil # cached bodies deliberately yield no result
  end

  metadata.comments << "INFO: Using 'Kellog's Distiller' to try to extract metadata from return value (message body).\n"
  # Work on a copy: force_encoding/scrub! on the argument would mutate the
  # caller's string and raise FrozenError for frozen input.
  html = body.dup.force_encoding('UTF-8').scrub
  html = html.gsub(%r{"@context"\s*:\s*"https?://schema.org/?"}, '"@context": "https://schema.org/docs/jsonldcontext.json"') # a bug in distiller, apparently

  file = Tempfile.new('foo', encoding: 'UTF-8')
  begin
    file.write(html)
    file.rewind
    metadata.comments << "INFO: The message body is being examined by Distiller\n"
    command = "LANG=en_US.UTF-8 #{FspHarvester::RDF_COMMAND} serialize --input-format rdfa --output-format jsonld #{file.path}"
    warn "distiller command: #{command}"
    result, stderr, = Open3.capture3(command)
    warn ''
    warn "distiller errors: #{stderr}" unless stderr.to_s.empty?
  ensure
    # Always remove the temp file, even when writing or spawning fails.
    file.close!
  end

  result = result.force_encoding('UTF-8')
  if result.match?(/@context/i)
    metadata.comments << "INFO: The Distiller found parseable data.  Parsing as JSON-LD\n"
  else
    metadata.comments << "WARN: The Distiller tool failed to find parseable data in the body, perhaps due to incorrectly formatted HTML..\n"
    metadata.add_warning(['018', '', ''])
    result = '{}'
  end
  distillerknown[bhash] = true
  result
end
#process_with_extruct(uri:, metadata:) ⇒ Object
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
# File 'lib/external_tools.rb', line 53
#
# Runs the external extruct command against a URI and splits its JSON report
# into the five metadata flavours it may contain.
#
# URIs are remembered in #extructknown by their SHA256 digest; a URI already
# seen by this instance is not fetched again and the defaults are returned.
#
# @param uri [String] address handed to the extruct command (shell-escaped)
# @param metadata [#comments, #add_warning] receives INFO/WARN comment lines
#   and warning code '019' on extruct or JSON errors
# @return [Array] [jsonld, microdata, microformat, opengraph, rdfa]; jsonld and
#   rdfa are JSON strings (default '{}'), the others are the first entry of the
#   corresponding list (default {})
def process_with_extruct(uri:, metadata:)
  # Local require: this file's top-level require block is not visible from here.
  require 'shellwords'

  bhash = Digest::SHA256.hexdigest(uri)
  jsonld = '{}'
  microdata = {}
  microformat = {}
  opengraph = {}
  rdfa = '{}'

  if extructknown[bhash]
    metadata.comments << "INFO: data is already parsed by extruct.\n"
  else
    metadata.comments << "INFO: Using 'extruct' to try to extract metadata from return value (message body) of #{uri}.\n"
    warn 'begin open3'
    # The URI comes from harvested data; escape it so characters such as `&`
    # or `;` cannot truncate the command or inject a second one.
    command = "#{FspHarvester::EXTRUCT_COMMAND} #{Shellwords.escape(uri)}"
    stdout, stderr, status = Open3.capture3(command)
    warn "open3 status: #{status} #{stdout}"
    output = stderr.to_s # absurd that the output comes over stderr!  LOL!

    if output.match(/(Failed\sto\sextract.*?)\n/)
      metadata.comments << "WARN: extruct threw an error #{Regexp.last_match(1)} when attempting to parse return value (message body) of #{uri}.\n"
      metadata.add_warning(['019', '', ''])
      if output.match(/(ValueError:.*?)\n/)
        metadata.comments << "WARN: extruct error was #{Regexp.last_match(1)}\n"
        metadata.add_warning(['019', '', ''])
      end
    elsif output.match?(/^\s+?\{/) || output.match?(/^\s+\[/) # this is JSON
      json = begin
        JSON.parse(output)
      rescue StandardError
        nil
      end
      # Anything that is not a JSON object cannot be indexed by flavour name,
      # so it is reported the same way as a parse failure.
      unless json.is_a?(Hash)
        metadata.comments << "WARN: extruct threw an error when attempting to parse the extruct command return value from processing #{uri}.\n"
        metadata.add_warning(['019', '', ''])
        return [jsonld, microdata, microformat, opengraph, rdfa]
      end

      metadata.comments << "INFO: the extruct tool found parseable data at #{uri}\n"
      # `&.` keeps the default when extruct omits a flavour entirely.
      jsonld = json['json-ld'].to_json if json['json-ld']&.any?
      microdata = json['microdata'].first if json['microdata']&.any?
      microformat = json['microformat'].first if json['microformat']&.any?
      opengraph = json['opengraph'].first if json['opengraph']&.any?
      rdfa = json['rdfa'].to_json if json['rdfa']&.any?
    else
      # Report on the metadata object passed in, like every other branch.
      metadata.comments << "WARN: the extruct tool failed to find parseable data at #{uri}\n"
    end
  end

  extructknown[bhash] = true
  [jsonld, microdata, microformat, opengraph, rdfa]
end