Class: HarvesterTools::MetadataParser
- Inherits:
-
Object
- Object
- HarvesterTools::MetadataParser
- Defined in:
- lib/metadata_parser.rb
Constant Summary collapse
- @@distillerknown =
attr_accessor :distillerknown
{}
Class Method Summary collapse
- .parse_rdf(body:, content_type:, metadata:) ⇒ Object
Instance Method Summary collapse
-
#initialize(metadata_object: HarvesterTools::MetadataObject.new) ⇒ MetadataParser
constructor
A new instance of MetadataParser.
- #parse_rdf(body:, content_type:, metadata:) ⇒ Object
- #process_html(body:, uri:, metadata: @meta) ⇒ Object
- #process_json(body:, metadata:) ⇒ Object
- #process_ld(body:, content_type:, metadata:) ⇒ Object
- #process_xml(body:, metadata:) ⇒ Object
Constructor Details
#initialize(metadata_object: HarvesterTools::MetadataObject.new) ⇒ MetadataParser
Returns a new instance of MetadataParser.
11 12 13 |
# File 'lib/metadata_parser.rb', line 11
#
# Creates a parser bound to a metadata object; the object is stored in @meta
# and serves as the default target of #process_html.
#
# @param metadata_object [HarvesterTools::MetadataObject] object that harvested
#   metadata is accumulated into; a fresh one is built when the argument is omitted
def initialize(metadata_object: HarvesterTools::MetadataObject.new)
  @meta = metadata_object
end
Class Method Details
.parse_rdf(body:, content_type:, metadata:) ⇒ Object
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
# File 'lib/metadata_parser.rb', line 58
#
# Parses an RDF serialisation and merges the resulting statements into +metadata+.
#
# Every failure is reported through +metadata.comments+ (and, except for the
# "reader is empty" case, through +metadata.add_warning+ with code '018');
# the method then returns without raising.
#
# Flow:
# 1. reject a nil body or a body without any word character;
# 2. resolve an RDF format from +content_type+ via RDF::Format.for;
# 3. reuse the graph from HarvesterTools::Cache when it is non-empty;
# 4. otherwise parse the body, write it to the cache, and merge the statements.
#
# NOTE(review): +body+ is mutated in place by +force_encoding('UTF-8')+, as in
# the original listing; callers passing frozen strings will hit the "failed to
# parse" branch - confirm whether that is intended.
#
# @param body [String, nil] raw response body
# @param content_type [String] MIME type used to select the RDF reader
# @param metadata [#comments, #add_warning, #merge_rdf] object receiving results
#   (presumably a HarvesterTools::MetadataObject - confirm against callers)
# @return [Object, nil] nil on the early-exit paths; otherwise the value of the last expression
def self.parse_rdf(body:, content_type:, metadata:)
  @meta = metadata # kept from the original listing; nothing in this method reads it back
  warn "1 PARSING RDF #{body}"
  unless body
    metadata.comments << "CRITICAL: The response message body component appears to have no content.\n"
    metadata.add_warning(['018', '', ''])
    return
  end

  warn "2 PARSING RDF #{body}"
  unless body.match?(/\w/)
    metadata.comments << "CRITICAL: The response message body component appears to have no content.\n"
    metadata.add_warning(['018', '', ''])
    return
  end

  warn "3 PARSING RDF #{body} content type #{content_type.class}"
  rdfformat = RDF::Format.for(content_type: content_type)
  warn "FORMAT #{rdfformat}"
  warn "FORMAT #{RDF::Format.for(content_type: 'text/turtle')}"
  unless rdfformat
    # String#delete (not delete!) is required here: delete! returns nil when the
    # sample contains no newline, which blanked the sample in the message.
    metadata.comments << "CRITICAL: Found what appears to be RDF (sample: #{body[0..300].delete("\n")}), but it could not find a parser. Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
    metadata.add_warning(['018', '', ''])
    return
  end

  graph = HarvesterTools::Cache.checkRDFCache(body: body)
  if graph.size > 0
    warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
    metadata.merge_rdf(graph.to_a)
  else
    warn "\n\n\nfound format #{rdfformat}\n\n"
    metadata.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"
    begin
      reader = rdfformat.reader.new(body.force_encoding('UTF-8'))
    rescue StandardError, ScriptError => e
      # Narrowed from `rescue Exception` so interrupts and exit requests propagate.
      metadata.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}). This likely indicates some syntax error in the data. As a result, no metadata will be extracted from this message.\n"
      metadata.add_warning(['018', '', ''])
      return
    end

    begin
      if reader.size.zero?
        metadata.comments << "WARN: Though linked data was found, it failed to parse. This likely indicates some syntax error in the data. \nAs a result, no metadata will be extracted from this message.\n"
        return
      end
      reader = rdfformat.reader.new(body) # have to re-read it here, but now its safe because we have already caught errors
      warn 'WRITING TO CACHE'
      HarvesterTools::Cache.writeRDFCache(reader: reader, body: body.force_encoding('UTF-8')) # write to the special RDF graph cache
      warn 'WRITING DONE'
      reader = rdfformat.reader.new(body.force_encoding('UTF-8')) # frustrating that we cannot rewind!
      warn 'RE-READING DONE'
      metadata.merge_rdf(reader.to_a)
      warn 'MERGE DONE'
    rescue RDF::ReaderError => e
      metadata.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
      warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} || (sample of what was parsed: #{body[0..300].delete("\n")})\n"
      metadata.add_warning(['018', '', ''])
    rescue StandardError, ScriptError => e
      metadata.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed: #{body[0..300].delete("\n")}). Moving on...\n"
      warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body: #{body.force_encoding('UTF-8')}). Moving on...\n"
      metadata.add_warning(['018', '', ''])
    end
  end
end
Instance Method Details
#parse_rdf(body:, content_type:, metadata:) ⇒ Object
54 55 56 |
# File 'lib/metadata_parser.rb', line 54
#
# Instance-level convenience wrapper: forwards all three keyword arguments,
# unchanged, to the class-level parse_rdf.
#
# @param body [String, nil] raw response body
# @param content_type [String] MIME type of +body+
# @param metadata [Object] object that receives the parsed statements and diagnostics
# @return [Object] whatever the class-level parse_rdf returns
def parse_rdf(body:, content_type:, metadata:)
  self.class.parse_rdf(body: body, content_type: content_type, metadata: metadata)
end
#process_html(body:, uri:, metadata: @meta) ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 |
# File 'lib/metadata_parser.rb', line 15
#
# Extracts embedded metadata from an HTML document.
#
# The body is run through the distiller and the URI through extruct (both via
# HarvesterTools::ExternalTools). The JSON-LD and RDFa results are handed to
# #parse_rdf with the 'application/ld+json' content type; the microdata,
# microformat and opengraph results are merged with +merge_hash+.
#
# The original listing wrote most results to @meta regardless of the
# +metadata+ argument; every step now targets +metadata+, which still
# defaults to @meta, so default callers are unaffected.
#
# @param body [String] HTML source
# @param uri [String] address of the document, passed to extruct
# @param metadata [#merge_hash] object receiving the results (defaults to @meta)
# @return [Object] the result of the final #parse_rdf call
def process_html(body:, uri:, metadata: @meta)
  tools = HarvesterTools::ExternalTools.new(metadata: metadata)
  tools.process_with_distiller(body: body, metadata: metadata) # adds to metadata
  jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri, metadata: metadata)
  parse_rdf(body: jsonld, content_type: 'application/ld+json', metadata: metadata)
  [microdata, microformat, opengraph].each { |extracted| metadata.merge_hash(extracted) }
  parse_rdf(body: rdfa, content_type: 'application/ld+json', metadata: metadata)
end
#process_json(body:, metadata:) ⇒ Object
39 40 41 42 43 44 45 46 47 48 |
# File 'lib/metadata_parser.rb', line 39
#
# Parses a JSON document and merges its top-level keys into +metadata.hash+.
#
# On a parse failure a CRITICAL comment and warning '021' are recorded and the
# method returns nil. The original listing fell through after the rescue,
# announced a merge and then called +merge+ with nil, which raises TypeError.
#
# The merge is done in place (+merge!+): the original +merge+ built a new hash
# and discarded it, so the metadata object was never updated. The return value
# is still the merged hash.
#
# @param body [String] JSON text
# @param metadata [#comments, #add_warning, #hash] object receiving the result
# @return [Hash, nil] the merged hash, or nil when +body+ could not be parsed
def process_json(body:, metadata:)
  begin
    parsed = JSON.parse(body)
  rescue StandardError
    metadata.comments << "CRITICAL: Malformed JSON detected. Cannot process metadata.\n"
    metadata.add_warning(['021', '', ''])
    return
  end

  metadata.comments << "INFO: The JSON is being merged in the metadata object\n"
  metadata.hash.merge!(parsed)
end
#process_ld(body:, content_type:, metadata:) ⇒ Object
50 51 52 |
# File 'lib/metadata_parser.rb', line 50
#
# Handles a linked-data response by passing it straight to #parse_rdf.
#
# @param body [String, nil] raw response body
# @param content_type [String] MIME type of +body+
# @param metadata [Object] object that receives the parsed statements and diagnostics
# @return [Object] whatever #parse_rdf returns
def process_ld(body:, content_type:, metadata:)
  parse_rdf(body: body, content_type: content_type, metadata: metadata)
end
#process_xml(body:, metadata:) ⇒ Object
27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/metadata_parser.rb', line 27
#
# Parses an XML document with XmlSimple.xml_in and merges the resulting hash
# into +metadata.hash+.
#
# On a parse failure a CRITICAL comment and warning '020' are recorded and the
# method returns nil. The original listing fell through after the rescue,
# announced a merge and then called +merge+ with nil, which raises TypeError.
#
# The merge is done in place (+merge!+): the original +merge+ built a new hash
# and discarded it, so the metadata object was never updated. The return value
# is still the merged hash.
#
# @param body [String] XML text
# @param metadata [#comments, #add_warning, #hash] object receiving the result
# @return [Hash, nil] the merged hash, or nil when +body+ could not be parsed
def process_xml(body:, metadata:)
  begin
    parsed = XmlSimple.xml_in(body)
  rescue StandardError
    metadata.comments << "CRITICAL: Malformed XML detected. Cannot process metadata.\n"
    metadata.add_warning(['020', '', ''])
    return
  end

  metadata.comments << "INFO: The XML is being merged in the metadata object\n"
  metadata.hash.merge!(parsed)
end