Class: RelatonOasis::DataParser

Inherits:
Object
  • Object
show all
Includes:
DataParserUtils
Defined in:
lib/relaton_oasis/data_parser.rb

Overview

Parser for OASIS document.

Instance Method Summary collapse

Methods included from DataParserUtils

#affiliation, #contact, #create_contribution_info, #create_person, #page, #parse_chairs, #parse_contributor, #parse_docid, #parse_doctype, #parse_editors, #parse_editors_from_text, #parse_errata, #parse_part, #parse_spec, #publisher_oasis, #retry_page

Constructor Details

#initialize(node) ⇒ DataParser

Initialize parser.

Parameters:

  • node (Nokogiri::HTML::Element)

    document node



11
12
13
# File 'lib/relaton_oasis/data_parser.rb', line 11

#
# Initialize parser.
#
# @param [Nokogiri::HTML::Element] node document node; it is stored in
#   @node and queried by the other methods of this parser
#
def initialize(node)
  @node = node
end

Instance Method Details

#document_part_refsArray<String>

Look for “Cite as” references.

Returns:

  • (Array<String>)

    document part references



135
136
137
138
139
140
# File 'lib/relaton_oasis/data_parser.rb', line 135

#
# Collect the text of every "Cite as" reference label in the node.
#
# Each label loses one or two opening square brackets at the start of a
# line and one closing bracket at the end of a line, then is stripped of
# surrounding whitespace.
#
# @return [Array<String>] document part references
#
def document_part_refs
  selectors = [
    ".standard__grid--cite-as > p > strong",
    "span.Refterm",
    "span.abbrev",
    "span.citationLabel > strong",
  ]
  @node.css(*selectors).map do |label|
    label.text.gsub(/^\[{1,2}|\]$/, "").strip
  end
end


107
108
109
# File 'lib/relaton_oasis/data_parser.rb', line 107

#
# First anchor inside a "Cite as" paragraph that carries a <strong>
# (directly or inside a <span>). The lookup result is cached in
# @link_node once it is truthy.
#
# @return [Object, nil] whatever @node.at returns for the XPath
#
def link_node
  return @link_node if @link_node

  @link_node = @node.at(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[strong or span/strong]/a",
  )
end


160
161
162
163
164
# File 'lib/relaton_oasis/data_parser.rb', line 160

#
# Anchors with an href taken from the first paragraph of the first grid
# cell; when that paragraph has none, the second paragraph is used.
#
# @return [Object] result of @node.xpath (possibly empty)
#
def links
  from_first_para = @node.xpath("./div/div/div[1]/p[1]/a[@href]")
  return from_first_para unless from_first_para.empty?

  @node.xpath("./div/div/div[1]/p[2]/a[@href]")
end

#parseRelatonOasis::OasisBibliographicItem

Parse document.

Returns:



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/relaton_oasis/data_parser.rb', line 30

#
# Parse document.
#
# Every attribute comes from the matching parse_* method; type, language
# and script are fixed to "standard", ["en"] and ["Latn"].
#
# @return [RelatonOasis::OasisBibliographicItem] bibliographic item
#
def parse # rubocop:disable Metrics/MethodLength
  attributes = {
    type: "standard",
    doctype: parse_doctype,
    title: parse_title,
    docid: parse_docid,
    link: parse_link,
    docnumber: parse_docnumber,
    date: parse_date,
    contributor: parse_contributor,
    abstract: parse_abstract,
    language: ["en"],
    script: ["Latn"],
    editorialgroup: parse_editorialgroup,
    relation: parse_relation,
    technology_area: parse_technology_area,
  }
  RelatonOasis::OasisBibliographicItem.new(**attributes)
end

#parse_abstractArray<RelatonBib::FormattedString>

Parse abstract.

Returns:

  • (Array<RelatonBib::FormattedString>)

    abstract



76
77
78
79
80
81
82
83
# File 'lib/relaton_oasis/data_parser.rb', line 76

#
# Parse abstract.
#
# Description paragraphs have runs of newlines/tabs collapsed to a single
# space, are stripped, and are joined with "\n". An empty result yields
# no abstract at all.
#
# @return [Array<RelatonBib::FormattedString>] abstract
#
def parse_abstract
  paragraphs = @node.xpath("./summary/div/div[@class='standard__description']/p")
  content = paragraphs.map { |para| para.text.gsub(/[\n\t]+/, " ").strip }.join("\n")
  if content.empty?
    []
  else
    [RelatonBib::FormattedString.new(content: content, language: "en", script: "Latn")]
  end
end

#parse_authorizerObject



98
99
100
101
102
103
104
105
# File 'lib/relaton_oasis/data_parser.rb', line 98

#
# Build one "authorizer" contribution per anchor in the details block.
#
# The anchor text (stripped) becomes the organization name and its href
# becomes a "uri" contact.
#
# @return [Array<RelatonBib::ContributionInfo>] authorizing committees
#
def parse_authorizer
  anchors = @node.xpath("./div[@class='standard__details']/a")
  anchors.map do |anchor|
    uri = RelatonBib::Contact.new(type: "uri", value: anchor[:href])
    committee = RelatonBib::Organization.new(name: anchor.text.strip, contact: [uri])
    RelatonBib::ContributionInfo.new(
      entity: committee,
      role: [{ type: "authorizer", description: ["Committee"] }],
    )
  end
end

#parse_dateArray<RelatonBib::BibliographicDate>

Parse date.

Returns:

  • (Array<RelatonBib::BibliographicDate>)

    date



63
64
65
66
67
68
69
# File 'lib/relaton_oasis/data_parser.rb', line 63

#
# Parse date.
#
# Each <time class="standard__date"> element in the summary is searched
# for a "<day> <month> <year>" fragment (one- or two-digit day, four-digit
# year). Elements with no such fragment, or with a fragment Date.parse
# rejects, are skipped rather than aborting the whole document.
#
# @return [Array<RelatonBib::BibliographicDate>] dates of type "issued"
#
def parse_date
  time_nodes = @node.xpath("./summary/div/time[@class='standard__date']")
  time_nodes.each_with_object([]) do |time_node, dates|
    date_str = time_node.text[/\d{1,2}\s\w+\s\d{4}/]
    next unless date_str

    begin
      on = Date.parse(date_str).to_s
    rescue ArgumentError
      # Date.parse signals an unrecognised date with ArgumentError
      # (Date::Error is a subclass); skip this element.
      next
    end
    dates << RelatonBib::BibliographicDate.new(on: on, type: "issued")
  end
end

#parse_docnumberString

Parse document number. If the document has no parts, the document number is constructed from the title. If the document has one part, the document number is constructed from that part. If the document has several parts, the document number is constructed from the parts.

Returns:

  • (String)

    document number



174
175
176
177
178
179
180
181
# File 'lib/relaton_oasis/data_parser.rb', line 174

#
# Parse document number.
#
# The source of the number depends on how many "Cite as" references the
# document has: none - the summary heading is turned into an identifier;
# one - that reference is used; several - they are combined by
# parts_to_docid.
#
# @return [String] document number
#
def parse_docnumber
  refs = document_part_refs
  return parts_to_docid(refs) if refs.size > 1
  return parse_part(parse_spec(refs[0])) if refs.size == 1

  heading = @node.at("./summary/div/h2").text
  parse_spec(title_to_docid(heading))
end

#parse_editorialgroupRelatonBib::EditorialGroup

Parse technical committee.

Returns:

  • (RelatonBib::EditorialGroup)

    technical committee



90
91
92
93
94
95
96
# File 'lib/relaton_oasis/data_parser.rb', line 90

#
# Parse technical committee.
#
# Every anchor in the details block becomes a technical committee whose
# work group is named after the stripped anchor text.
#
# @return [RelatonBib::EditorialGroup] technical committee
#
def parse_editorialgroup
  committees = @node.xpath("./div[@class='standard__details']/a").map do |anchor|
    RelatonBib::TechnicalCommittee.new(
      RelatonBib::WorkGroup.new(name: anchor.text.strip),
    )
  end
  RelatonBib::EditorialGroup.new(committees)
end


142
143
144
145
146
147
148
149
150
151
152
# File 'lib/relaton_oasis/data_parser.rb', line 142

#
# Parse links.
#
# Nothing is returned for multi-part documents. Otherwise each anchor is
# typed by the trailing ".ext" of its href, where "docx" is rewritten to
# "doc" and "html" to "src"; an href without such an extension is "src".
#
# @return [Array<RelatonBib::TypedUri>] typed links
#
def parse_link
  return [] if parts.size > 1

  links.map do |anchor|
    href = anchor[:href]
    extension = href.match(/\.(\w+)$/)&.captures&.first
    type = (extension || "src").sub("docx", "doc").sub("html", "src")
    RelatonBib::TypedUri.new(type: type, content: href)
  end
end

#parse_relationArray<RelatonBib::DocumentRelation>

Parse relation.

Returns:

  • (Array<RelatonBib::DocumentRelation>)

    relation



116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/relaton_oasis/data_parser.rb', line 116

#
# Parse relation.
#
# Relations are produced only when the "Cite as" block holds more than
# one part paragraph; each part becomes a "hasPart" relation whose
# bibitem carries the part's first document identifier as formatted
# reference.
#
# @return [Array<RelatonBib::DocumentRelation>] relation
#
def parse_relation
  part_nodes = @node.xpath(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[strong or span/strong or b/span]",
  )
  return [] if part_nodes.size <= 1

  part_nodes.map do |part_node|
    part_ids = DataPartParser.new(part_node).parse_docid
    reference = RelatonBib::FormattedRef.new(content: part_ids[0].id)
    part_item = RelatonOasis::OasisBibliographicItem.new(formattedref: reference)
    RelatonBib::DocumentRelation.new(type: "hasPart", bibitem: part_item)
  end
end

#parse_technology_areaArray<String>

Parse technology areas.

Returns:

  • (Array<String>)

    technology areas



248
249
250
# File 'lib/relaton_oasis/data_parser.rb', line 248

#
# Parse technology areas.
#
# Delegates to the inherited/mixed-in implementation, handing it the
# document node held by this parser.
#
# @return [Array<String>] technology areas
#
def parse_technology_area
  super(@node)
end

#parse_titleArray<RelatonBib::TypedTitleString>

Parse title.

Returns:

  • (Array<RelatonBib::TypedTitleString>)

    main title of the document



54
55
56
# File 'lib/relaton_oasis/data_parser.rb', line 54

#
# Parse title.
#
# @return [Array<RelatonBib::TypedTitleString>] single "main" title in
#   English / Latin script, with content taken from #title
#
def parse_title
  main_title = RelatonBib::TypedTitleString.new(
    type: "main",
    content: title,
    language: "en",
    script: "Latn",
  )
  [main_title]
end

#partsObject



154
155
156
157
158
# File 'lib/relaton_oasis/data_parser.rb', line 154

#
# "Cite as" paragraphs that contain a <strong>, either directly or inside
# a <span>. The query result is cached in @parts once it is truthy.
#
# @return [Object] result of @node.xpath for the part paragraphs
#
def parts
  return @parts if @parts

  cite_as_paragraphs = "./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[strong or span/strong]"
  @parts = @node.xpath(cite_as_paragraphs)
end

#parts_to_docid(parts) ⇒ String

Create document identifier from parts references.

Parameters:

  • parts (Array<String>)

    parts references

Returns:

  • (String)

    document identifier



190
191
192
193
194
195
196
197
198
199
200
201
# File 'lib/relaton_oasis/data_parser.rb', line 190

#
# Create document identifier from parts references.
#
# Each reference is split on "-" and the longest run of leading chunks
# shared by all references (compared case-insensitively, keeping the
# spelling of the first reference) is re-joined with "-". The prefix is
# also cut when a reference has fewer chunks than the prefix so far, so
# the result does not depend on the order of the references.
#
# @param [Array<String>] parts parts references; at least one is expected
#
# @return [String] document identifier
#
def parts_to_docid(parts)
  first, *rest = parts.map { |ref| ref.split("-") }
  common = rest.reduce(first) do |prefix, chunks|
    # Pair the prefix with this reference's chunks; a nil partner means the
    # reference is shorter than the prefix, which ends the shared run.
    shared = prefix.zip(chunks).take_while do |kept, chunk|
      chunk && kept.casecmp(chunk)&.zero?
    end
    prefix.first(shared.size)
  end
  parse_part parse_spec(common.join("-"))
end

#textObject



19
20
21
22
23
# File 'lib/relaton_oasis/data_parser.rb', line 19

#
# Stripped text of the first "Cite as" paragraph that contains an <em>,
# <i>, <a> or <span>. The value is cached in @text once it is truthy;
# nil is returned when no such paragraph is found.
#
# @return [String, nil] citation text
#
def text
  return @text if @text

  citation = @node.at(
    "./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[em or i or a or span]",
  )
  @text = citation&.text&.strip
end

#titleObject



15
16
17
# File 'lib/relaton_oasis/data_parser.rb', line 15

#
# Text of the summary's <h2> heading, cached in @title once it is truthy.
#
# @return [String] document title
#
def title
  return @title if @title

  heading = @node.at("./summary/div/h2")
  @title = heading.text
end

#title_to_docid(title) ⇒ String

Create document identifier from title.

Parameters:

  • title (String)

    title

Returns:

  • (String)

    document identifier



210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/relaton_oasis/data_parser.rb', line 210

#
# Create document identifier from title.
#
# When the title contains parenthesised text, the identifier is built from
# those abbreviations, plus "-v<version>" when a "Version N" / "vN" is
# present and an "ebXML-"/"ebMS-" prefix when the title mentions one.
# Otherwise the title is tokenised and folded into an acronym-like
# identifier whose chunks are joined with "-".
#
# @param [String] title title
#
# @return [String] document identifier
#
def title_to_docid(title) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  # Everything found between "(" and ")" is treated as an abbreviation.
  abbrs = title.scan(/(?<=\()[^)]+(?=\))/)
  if abbrs.any?
    id = abbrs.map { |abbr| abbr.split.join("-") }.join "-"
    # The named captures below assign the locals `ver` and `eb` (nil when
    # the pattern does not match).
    /(?:Version\s|v)(?<ver>[\d.]+)/ =~ title
    id += "-v#{ver}" if ver
    /(?<eb>ebXML|ebMS)/ =~ title
    id = "#{eb}-#{id}" if eb
    id
  else
    # series_end tells whether the chunk being built is finished, i.e.
    # whether the next token starts a new chunk or extends the last one.
    series_end = false
    # Drop a trailing " [OASIS <number>]" and split on whitespace (with an
    # optional preceding "," or ":"), on "-", and at lower-to-Capitalised
    # camel-case boundaries.
    title.sub(/\s\[OASIS\s\d+\]$/, "").split(/[,:]?\s|-|(?<=[a-z])(?=[A-Z][a-z])/)
      .each_with_object([""]) do |word, acc|
      if word =~ /^v[\d.]+/
        # Matched "v<digits/dots>" text; taken from the match data directly
        # so it does not depend on the English library defining $MATCH.
        acc << Regexp.last_match(0)
        series_end = true
      elsif word.match?(/^Version/)
        acc << "v"
        series_end = false
      elsif word.match?(/^\d|ebXML|ebMS/)
        series_end ? acc << word : acc[-1] += word
        series_end = true
      elsif word.match?(/^\w+$/) && word == word.upcase
        # An all-caps token replaces the chunk in progress.
        series_end ? acc << word : acc[-1] = word
        series_end = true
      elsif word.match?(/[A-Z]+[a-z]+/)
        # Capitalised word: only its first letter is kept.
        series_end ? acc << word[0] : acc[-1] += word[0]
        series_end = false
      end
    end.join "-"
  end
end