Class: HarvesterTools::MetadataParser

Inherits:
Object
  • Object
show all
Defined in:
lib/metadata_parser.rb

Constant Summary collapse

@@distillerknown =

attr_accessor :distillerknown

{}

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(metadata_object: HarvesterTools::MetadataObject.new) ⇒ MetadataParser

Returns a new instance of MetadataParser.



11
12
13
# File 'lib/metadata_parser.rb', line 11

# Builds a parser bound to a metadata object.
#
# @param metadata_object [HarvesterTools::MetadataObject] the object kept in
#   @meta; a fresh one is created when the caller does not supply one.
def initialize(metadata_object: HarvesterTools::MetadataObject.new)
  @meta = metadata_object
end

Class Method Details

.parse_rdf(body:, content_type:, metadata:) ⇒ Object



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/metadata_parser.rb', line 58

# Parses a linked-data message body and merges the resulting statements into
# the supplied metadata object.
#
# Problems (empty body, no parser for the content type, parse failures) are
# reported by appending a message to metadata.comments and, in most cases,
# registering warning ['018', '', ''] via metadata.add_warning; they are not
# raised to the caller.
#
# @param body [String, nil] the raw message body
# @param content_type [String] handed to RDF::Format.for to select a parser
# @param metadata [Object] must respond to #comments, #add_warning and #merge_rdf
# @return [Object, nil] nil on the early-exit paths; otherwise the value of the
#   last statement executed in the branch taken
def self.parse_rdf(body:, content_type:, metadata:)
  @meta = metadata
  warn "1 PARSING RDF #{body}"
  unless body
    metadata.comments << "CRITICAL: The response message body component appears to have no content.\n"
    metadata.add_warning(['018', '', ''])
    return
  end
  warn "2 PARSING RDF #{body}"

  # A body with no word characters at all is treated the same as a missing body.
  unless body.match?(/\w/)
    metadata.comments << "CRITICAL: The response message body component appears to have no content.\n"
    metadata.add_warning(['018', '', ''])
    return
  end
  warn "3 PARSING RDF #{body} content type #{content_type.class}"

  rdfformat = RDF::Format.for(content_type: content_type)
  warn "FORMAT #{rdfformat}"
  warn "FORMAT #{RDF::Format.for(content_type: 'text/turtle')}"
  unless rdfformat
    # String#delete (not delete!) so the sample survives when it holds no newline;
    # delete! returns nil in that case and the sample would be lost.
    metadata.comments << "CRITICAL: Found what appears to be RDF (sample:  #{body[0..300].delete("\n")}), but it could not find a parser.  Please report this error, along with the GUID of the resource, to the maintainer of the system.\n"
    metadata.add_warning(['018', '', ''])
    return
  end

  graph = HarvesterTools::Cache.checkRDFCache(body: body)
  if graph.size > 0
    warn "\n\n\n unmarshalling graph from cache\n\ngraph size #{graph.size}\n\n"
    metadata.merge_rdf(graph.to_a)
  else
    warn "\n\n\nfound format #{rdfformat}\n\n"
    metadata.comments << "INFO: The response message body component appears to contain #{rdfformat}.\n"

    # Work on a UTF-8 tagged copy: force_encoding mutates its receiver, which
    # would alter the caller's string and raise FrozenError on a frozen body.
    utf8_body = body.dup.force_encoding('UTF-8')

    begin
      reader = rdfformat.reader.new(utf8_body)
    rescue StandardError => e
      metadata.comments << "WARN: Though linked data was found, it failed to parse (Exception #{e}).  This likely indicates some syntax error in the data.  As a result, no metadata will be extracted from this message.\n"
      metadata.add_warning(['018', '', ''])
      return
    end

    begin
      if reader.size.zero?
        metadata.comments << "WARN: Though linked data was found, it failed to parse.  This likely indicates some syntax error in the data.  As a result, no metadata will be extracted from this message.\n"
        return
      end
      # The reader cannot be rewound, so a fresh one is built for each pass.
      reader = rdfformat.reader.new(utf8_body)
      warn 'WRITING TO CACHE'
      HarvesterTools::Cache.writeRDFCache(reader: reader, body: utf8_body) # write to the special RDF graph cache
      warn 'WRITING DONE'
      reader = rdfformat.reader.new(utf8_body)
      warn 'RE-READING DONE'
      metadata.merge_rdf(reader.to_a)
      warn 'MERGE DONE'
    rescue RDF::ReaderError => e
      metadata.comments << "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} ||  (sample of what was parsed:  #{body[0..300].delete("\n")})\n"
      warn "CRITICAL: The Linked Data was malformed and caused the parser to crash with error message: #{e.message} ||  (sample of what was parsed:  #{body[0..300].delete("\n")})\n"
      metadata.add_warning(['018', '', ''])
    rescue StandardError => e
      # StandardError rather than Exception, so interrupts and exits still propagate.
      metadata.comments << "CRITICAL: An unknown error occurred while parsing the (apparent) Linked Data (sample of what was parsed:  #{body[0..300].delete("\n")}).  Moving on...\n"
      warn "\n\nCRITICAL: #{e.inspect} An unknown error occurred while parsing the (apparent) Linked Data (full body:  #{utf8_body}).  Moving on...\n"
      metadata.add_warning(['018', '', ''])
    end
  end
end

Instance Method Details

#parse_rdf(body:, content_type:, metadata:) ⇒ Object



54
55
56
# File 'lib/metadata_parser.rb', line 54

# Instance-level convenience wrapper: forwards all three arguments unchanged
# to the class-level parse_rdf.
#
# @param body [String, nil] the raw message body
# @param content_type [String] content type used to select the RDF parser
# @param metadata [Object] the metadata object the parsed statements are merged into
# @return [Object, nil] whatever the class-level parse_rdf returns
def parse_rdf(body:, content_type:, metadata:)
  self.class.parse_rdf(body: body, content_type: content_type, metadata: metadata)
end

#process_html(body:, uri:, metadata: @meta) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
# File 'lib/metadata_parser.rb', line 15

# Extracts embedded metadata from an HTML document and folds it into the
# metadata object.
#
# Runs the distiller over the body, then extruct over the URI. The JSON-LD and
# RDFa results go through parse_rdf; the microdata, microformat and opengraph
# results are merged with merge_hash.
#
# @param body [String] the HTML body handed to the distiller
# @param uri [String] the address handed to extruct
# @param metadata [Object] destination metadata object; defaults to @meta.
#   Every step writes to this object, so an explicitly supplied object is
#   honoured throughout rather than only for the JSON-LD step.
# @return [Object, nil] the result of the final parse_rdf call
def process_html(body:, uri:, metadata: @meta)
  tools = HarvesterTools::ExternalTools.new(metadata: metadata)
  tools.process_with_distiller(body: body, metadata: metadata) # adds to metadata

  jsonld, microdata, microformat, opengraph, rdfa = tools.process_with_extruct(uri: uri, metadata: metadata)
  parse_rdf(body: jsonld, content_type: 'application/ld+json', metadata: metadata)
  metadata.merge_hash(microdata)
  metadata.merge_hash(microformat)
  metadata.merge_hash(opengraph)
  # NOTE(review): the RDFa result is parsed as JSON-LD; presumably extruct emits
  # RDFa in JSON-LD form -- confirm against process_with_extruct.
  parse_rdf(body: rdfa, content_type: 'application/ld+json', metadata: metadata)
end

#process_json(body:, metadata:) ⇒ Object



39
40
41
42
43
44
45
46
47
48
# File 'lib/metadata_parser.rb', line 39

# Parses a JSON body and merges the resulting hash into metadata.hash.
#
# A body that cannot be parsed is reported through metadata.comments and
# warning ['021', '', '']; nothing is merged in that case.
#
# @param body [String] the JSON text
# @param metadata [Object] must respond to #comments, #add_warning and #hash
# @return [Hash, nil] the updated metadata.hash, or nil when parsing failed
def process_json(body:, metadata:)
  begin
    parsed = JSON.parse(body)
  rescue StandardError
    metadata.comments << "CRITICAL: Malformed JSON detected.  Cannot process metadata.\n"
    metadata.add_warning(['021', '', ''])
    return # nothing to merge; continuing would call merge with nil
  end
  metadata.comments << "INFO: The JSON is being merged in the metadata object\n"
  # merge! updates the metadata hash in place; plain merge builds a new hash
  # whose result was discarded, so nothing was ever stored.
  metadata.hash.merge!(parsed)
end

#process_ld(body:, content_type:, metadata:) ⇒ Object



50
51
52
# File 'lib/metadata_parser.rb', line 50

# Processes a linked-data body by delegating straight to parse_rdf.
#
# @param body [String, nil] the raw message body
# @param content_type [String] content type used to select the RDF parser
# @param metadata [Object] the metadata object the parsed statements are merged into
# @return [Object, nil] whatever parse_rdf returns
def process_ld(body:, content_type:, metadata:)
  parse_rdf(body: body, content_type: content_type, metadata: metadata)
end

#process_xml(body:, metadata:) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
# File 'lib/metadata_parser.rb', line 27

# Parses an XML body with XmlSimple and merges the resulting hash into
# metadata.hash.
#
# A body that cannot be parsed is reported through metadata.comments and
# warning ['020', '', '']; nothing is merged in that case.
#
# @param body [String] the XML text
# @param metadata [Object] must respond to #comments, #add_warning and #hash
# @return [Hash, nil] the updated metadata.hash, or nil when parsing failed
def process_xml(body:, metadata:)
  begin
    parsed = XmlSimple.xml_in(body)
  rescue StandardError
    metadata.comments << "CRITICAL: Malformed XML detected.  Cannot process metadata.\n"
    metadata.add_warning(['020', '', ''])
    return # nothing to merge; continuing would call merge with nil
  end
  metadata.comments << "INFO: The XML is being merged in the metadata object\n"
  # merge! updates the metadata hash in place; plain merge builds a new hash
  # whose result was discarded, so nothing was ever stored.
  metadata.hash.merge!(parsed)
end