Class: RelatonOasis::DataParser
- Inherits:
-
Object
- Object
- RelatonOasis::DataParser
- Includes:
- DataParserUtils
- Defined in:
- lib/relaton_oasis/data_parser.rb
Overview
Parser for OASIS document.
Instance Method Summary collapse
-
#document_part_refs ⇒ Array<String>
Look for “Cite as” references.
-
#initialize(node) ⇒ DataParser
constructor
Initialize parser.
- #link_node ⇒ Object
- #links ⇒ Object
-
#parse ⇒ RelatonOasis::OasisBibliographicItem
Parse document.
-
#parse_abstract ⇒ Array<RelatonBib::FormattedString>
Parse abstract.
- #parse_authorizer ⇒ Object
-
#parse_date ⇒ Array<RelatonBib::BibliographicDate>
Parse date.
-
#parse_docnumber ⇒ String
Parse document number.
-
#parse_editorialgroup ⇒ RelatonBib::EditorialGroup
Parse technical committee.
- #parse_link ⇒ Object
-
#parse_relation ⇒ Array<RelatonBib::DocumentRelation>
Parse relation.
-
#parse_technology_area ⇒ Array<String>
Parse technology areas.
-
#parse_title ⇒ Array<RelatonBib::TypedTitleString>
Parse title.
- #parts ⇒ Object
-
#parts_to_docid(parts) ⇒ String
Create document identifier from parts references.
- #text ⇒ Object
- #title ⇒ Object
-
#title_to_docid(title) ⇒ String
Create document identifier from title.
Methods included from DataParserUtils
#affiliation, #contact, #create_contribution_info, #create_person, #page, #parse_chairs, #parse_contributor, #parse_docid, #parse_doctype, #parse_editors, #parse_editors_from_text, #parse_errata, #parse_part, #parse_spec, #publisher_oasis, #retry_page
Constructor Details
#initialize(node) ⇒ DataParser
Initialize parser.
11 12 13 |
# File 'lib/relaton_oasis/data_parser.rb', line 11
#
# Initialize parser.
#
# @param node [Object] document node the parser reads from; the other
#   methods call #at, #xpath and #css on it (presumably a Nokogiri
#   element -- confirm against the caller)
#
def initialize(node)
  @node = node
end
Instance Method Details
#document_part_refs ⇒ Array<String>
Look for “Cite as” references.
135 136 137 138 139 140 |
# File 'lib/relaton_oasis/data_parser.rb', line 135
#
# Look for "Cite as" references.
#
# Collects the text of every element matched by the selectors below and
# removes one or two opening brackets at a line start and a closing
# bracket at a line end, then trims surrounding whitespace.
#
# @return [Array<String>] references
#
def document_part_refs
  selectors = [
    ".standard__grid--cite-as > p > strong",
    "span.Refterm",
    "span.abbrev",
    "span.citationLabel > strong",
  ]
  @node.css(*selectors).map do |ref|
    ref.text.gsub(/^\[{1,2}|\]$/, "").strip
  end
end
#link_node ⇒ Object
107 108 109 |
# File 'lib/relaton_oasis/data_parser.rb', line 107
#
# First anchor inside a "cite as" paragraph that contains a <strong>
# (directly or inside a <span>). The lookup result is memoized.
#
# @return [Object, nil] whatever @node.at returns for the XPath
#
def link_node
  return @link_node if @link_node

  @link_node = @node.at(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[strong or span/strong]/a",
  )
end
#links ⇒ Object
160 161 162 163 164 |
# File 'lib/relaton_oasis/data_parser.rb', line 160
#
# Anchors with an href taken from the first paragraph of the first inner
# div; when that paragraph has none, the second paragraph is used instead.
#
# @return [Object] result of @node.xpath (a node collection)
#
def links
  first_para = @node.xpath("./div/div/div[1]/p[1]/a[@href]")
  return first_para unless first_para.empty?

  @node.xpath("./div/div/div[1]/p[2]/a[@href]")
end
#parse ⇒ RelatonOasis::OasisBibliographicItem
Parse document.
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
# File 'lib/relaton_oasis/data_parser.rb', line 30
#
# Parse document.
#
# Gathers the result of every parse_* helper and passes them as keyword
# arguments to the bibliographic item constructor. Type, language and
# script are fixed to "standard", ["en"] and ["Latn"].
#
# @return [RelatonOasis::OasisBibliographicItem] bibliographic item
#
def parse # rubocop:disable Metrics/MethodLength
  attrs = {
    type: "standard",
    doctype: parse_doctype,
    title: parse_title,
    docid: parse_docid,
    link: parse_link,
    docnumber: parse_docnumber,
    date: parse_date,
    contributor: parse_contributor,
    abstract: parse_abstract,
    language: ["en"],
    script: ["Latn"],
    editorialgroup: parse_editorialgroup,
    relation: parse_relation,
    technology_area: parse_technology_area,
  }
  RelatonOasis::OasisBibliographicItem.new(**attrs)
end
#parse_abstract ⇒ Array<RelatonBib::FormattedString>
Parse abstract.
76 77 78 79 80 81 82 83 |
# File 'lib/relaton_oasis/data_parser.rb', line 76
#
# Parse abstract.
#
# Each description paragraph has its runs of newlines/tabs collapsed to a
# single space and is trimmed; paragraphs are then joined with "\n".
#
# @return [Array<RelatonBib::FormattedString>] one-element array, or an
#   empty array when there is no description text
#
def parse_abstract
  paragraphs = @node.xpath("./summary/div/div[@class='standard__description']/p")
  content = paragraphs.map { |para| para.text.gsub(/[\n\t]+/, " ").strip }.join("\n")
  if content.empty?
    []
  else
    [RelatonBib::FormattedString.new(content: content, language: "en", script: "Latn")]
  end
end
#parse_authorizer ⇒ Object
98 99 100 101 102 103 104 105 |
# File 'lib/relaton_oasis/data_parser.rb', line 98
#
# Build one "authorizer" contribution per committee link found in the
# standard details block. The link text becomes the organization name
# and the link href its URI contact.
#
# @return [Array<RelatonBib::ContributionInfo>] authorizer contributions
#
def parse_authorizer
  @node.xpath("./div[@class='standard__details']/a").map do |a|
    cnt = RelatonBib::Contact.new(type: "uri", value: a[:href])
    org = RelatonBib::Organization.new name: a.text.strip, contact: [cnt]
    role = { type: "authorizer", description: ["Committee"] }
    RelatonBib::ContributionInfo.new entity: org, role: [role]
  end
end
#parse_date ⇒ Array<RelatonBib::BibliographicDate>
Parse date.
63 64 65 66 67 68 69 |
# File 'lib/relaton_oasis/data_parser.rb', line 63
#
# Parse date.
#
# Extracts a "<day> <month> <year>" substring from every date element and
# turns it into an "issued" date. Elements whose text holds no such
# substring are skipped instead of being passed to Date.parse as an empty
# string (which raises). Both one- and two-digit days are accepted.
#
# @return [Array<RelatonBib::BibliographicDate>] bibliographic dates
#
def parse_date
  @node.xpath("./summary/div/time[@class='standard__date']").each_with_object([]) do |d, dates|
    date_str = d.text[/\d{1,2}\s\w+\s\d{4}/]
    next unless date_str # nothing date-like in this element

    date = Date.parse(date_str).to_s
    dates << RelatonBib::BibliographicDate.new(on: date, type: "issued")
  end
end
#parse_docnumber ⇒ String
Parse document number. If the document has no parts, the document number is constructed from the title. If the document has one part, the document number is constructed from that part. If the document has several parts, the document number is constructed from the parts.
174 175 176 177 178 179 180 181 |
# File 'lib/relaton_oasis/data_parser.rb', line 174
#
# Parse document number.
#
# The number is derived from the "Cite as" references:
# - no reference: built from the document title;
# - one reference: built from that reference;
# - several references: built from what the references have in common.
#
# @return [String] document number
#
def parse_docnumber
  parts = document_part_refs
  case parts.size
  # reuse the memoized #title helper rather than repeating its h2 lookup
  when 0 then parse_spec title_to_docid(title)
  when 1 then parse_part parse_spec(parts[0])
  else parts_to_docid parts
  end
end
#parse_editorialgroup ⇒ RelatonBib::EditorialGroup
Parse technical committee.
90 91 92 93 94 95 96 |
# File 'lib/relaton_oasis/data_parser.rb', line 90
#
# Parse technical committee.
#
# Every link in the standard details block yields a technical committee
# whose work group is named after the trimmed link text.
#
# @return [RelatonBib::EditorialGroup] editorial group
#
def parse_editorialgroup
  committees = @node.xpath("./div[@class='standard__details']/a").map do |anchor|
    RelatonBib::TechnicalCommittee.new(RelatonBib::WorkGroup.new(name: anchor.text.strip))
  end
  RelatonBib::EditorialGroup.new(committees)
end
#parse_link ⇒ Object
142 143 144 145 146 147 148 149 150 151 152 |
# File 'lib/relaton_oasis/data_parser.rb', line 142
#
# Build typed URIs for the document links.
#
# Returns nothing when the document has more than one part. Otherwise the
# link type is the href's trailing extension, with "docx" rewritten to
# "doc" and "html" to "src"; hrefs without an extension get type "src".
#
# @return [Array<RelatonBib::TypedUri>] links
#
def parse_link
  return [] if parts.size > 1

  links.map do |l|
    ext = l[:href][/\.(\w+)$/, 1] || "src"
    # Non-mutating #sub: the fallback is a string literal, which must not
    # be modified in place (it may be frozen).
    type = ext.sub("docx", "doc").sub("html", "src")
    RelatonBib::TypedUri.new(type: type, content: l[:href])
  end
end
#parse_relation ⇒ Array<RelatonBib::DocumentRelation>
Parse relation.
116 117 118 119 120 121 122 123 124 125 126 127 128 |
# File 'lib/relaton_oasis/data_parser.rb', line 116
#
# Parse relation.
#
# When the "cite as" block lists more than one part paragraph, each part
# becomes a "hasPart" relation whose bibitem carries only a formatted
# reference made from the part's first document identifier.
#
# @return [Array<RelatonBib::DocumentRelation>] document relations
#
def parse_relation
  part_nodes = @node.xpath(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[strong or span/strong or b/span]",
  )
  return [] if part_nodes.size <= 1

  part_nodes.map do |part_node|
    ids = DataPartParser.new(part_node).parse_docid
    ref = RelatonBib::FormattedRef.new(content: ids[0].id)
    item = RelatonOasis::OasisBibliographicItem.new(formattedref: ref)
    RelatonBib::DocumentRelation.new(type: "hasPart", bibitem: item)
  end
end
#parse_technology_area ⇒ Array<String>
Parse technology areas.
248 249 250 |
# File 'lib/relaton_oasis/data_parser.rb', line 248
#
# Parse technology areas.
#
# Delegates to the inherited/included implementation, handing it the
# parser's node.
#
# @return [Array<String>] technology areas
#
def parse_technology_area
  super(@node)
end
#parse_title ⇒ Array<RelatonBib::TypedTitleString>
Parse title.
54 55 56 |
# File 'lib/relaton_oasis/data_parser.rb', line 54
#
# Parse title.
#
# Wraps the document title in a single English/Latin "main" title.
#
# @return [Array<RelatonBib::TypedTitleString>] titles
#
def parse_title
  main = RelatonBib::TypedTitleString.new(
    type: "main", content: title, language: "en", script: "Latn",
  )
  [main]
end
#parts ⇒ Object
154 155 156 157 158 |
# File 'lib/relaton_oasis/data_parser.rb', line 154
#
# Paragraphs of the "cite as" block that contain a <strong>, directly or
# inside a <span>. The lookup result is memoized.
#
# @return [Object] result of @node.xpath (a node collection)
#
def parts
  return @parts if @parts

  @parts = @node.xpath(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[strong or span/strong]",
  )
end
#parts_to_docid(parts) ⇒ String
Create document identifier from parts references.
190 191 192 193 194 195 196 197 198 199 200 201 |
# File 'lib/relaton_oasis/data_parser.rb', line 190
#
# Create document identifier from parts references.
#
# Splits every reference on "-" and keeps the leading chunks that all
# references share (compared case-insensitively, spelled as in the first
# reference). The shared prefix never grows longer than the shortest
# reference, so ["A-B-C", "A-B"] yields "A-B".
#
# @param parts [Array<String>] parts references (at least one)
#
# @return [String] document identifier
#
def parts_to_docid(parts)
  common = parts.map { |part| part.split("-") }.reduce do |prefix, chunks|
    # zip pads missing chunks with nil, which stops take_while and so
    # truncates the prefix to the shorter reference
    prefix.zip(chunks).take_while { |kept, chunk| chunk && kept.casecmp(chunk)&.zero? }.map(&:first)
  end
  parse_part parse_spec(common.join("-"))
end
#text ⇒ Object
19 20 21 22 23 |
# File 'lib/relaton_oasis/data_parser.rb', line 19
#
# Trimmed text of the first "cite as" paragraph that contains an <em>,
# <i>, <a> or <span>. A found value is memoized.
#
# @return [String, nil] paragraph text, nil when no paragraph matches
#
def text
  return @text if @text

  paragraph = @node.at(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[em or i or a or span]",
  )
  @text = paragraph&.text&.strip
end
#title ⇒ Object
15 16 17 |
# File 'lib/relaton_oasis/data_parser.rb', line 15
#
# Text of the summary's <h2> heading; memoized after the first lookup.
#
# @return [String] document title
#
def title
  return @title if @title

  heading = @node.at("./summary/div/h2")
  @title = heading.text
end
#title_to_docid(title) ⇒ String
Create document identifier from title.
210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 |
# File 'lib/relaton_oasis/data_parser.rb', line 210
#
# Create document identifier from title.
#
# If the title contains parenthesized abbreviations, they are joined with
# "-", followed by "-v<version>" when a version is present and preceded
# by "ebXML-"/"ebMS-" when the title mentions one of them.
#
# Otherwise an acronym is built word by word: capitalized words contribute
# their initial, all-caps words and numbers are taken whole, "Version"
# becomes a "v" prefix for the number that follows, and a "v1.2"-style
# word is taken as-is. A trailing " [OASIS nnn]" marker is ignored.
#
# @param title [String] title
#
# @return [String] document identifier
#
def title_to_docid(title) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  abbrs = title.scan(/(?<=\()[^)]+(?=\))/)
  if abbrs.any?
    id = abbrs.map { |abbr| abbr.split.join("-") }.join "-"
    /(?:Version\s|v)(?<ver>[\d.]+)/ =~ title
    id += "-v#{ver}" if ver
    /(?<eb>ebXML|ebMS)/ =~ title
    id = "#{eb}-#{id}" if eb
    id
  else
    # true when the current chunk is complete and the next token must
    # start a new "-"-separated chunk
    series_end = false
    words = title.sub(/\s\[OASIS\s\d+\]$/, "").split(/[,:]?\s|-|(?<=[a-z])(?=[A-Z][a-z])/)
    words.each_with_object([""]) do |word, acc|
      # Slice the match directly instead of reading $MATCH, which is only
      # defined once the English library has been required.
      if (version = word[/^v[\d.]+/])
        acc << version
        series_end = true
      elsif word.match?(/^Version/)
        acc << "v"
        series_end = false
      elsif word.match?(/^\d|ebXML|ebMS/)
        if series_end
          acc << word
        else
          acc[-1] += word
        end
        series_end = true
      elsif word.match?(/^\w+$/) && word == word.upcase
        if series_end
          acc << word
        else
          acc[-1] = word
        end
        series_end = true
      elsif word.match?(/[A-Z]+[a-z]+/)
        if series_end
          acc << word[0]
        else
          acc[-1] += word[0]
        end
        series_end = false
      end
    end.join "-"
  end
end