Class: RelatonEcma::DataParser

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_ecma/data_parser.rb

Constant Summary collapse

# Attributes collected for hits without an href; `parse` calls
# `fetch_mem_<name>` for each of them.
MATTRS =
%i[docid title date link doctype].freeze
# Attributes collected for hits with an href; `parse` calls `fetch_<name>`
# for each of them, in this order. The arrays are summed first and the result
# is frozen (a trailing `.freeze` on the second literal would freeze only that
# literal), and `doctype`, already part of MATTRS, is not repeated.
ATTRS =
(MATTRS + %i[abstract relation edition]).freeze

Instance Method Summary collapse

Constructor Details

#initialize(hit) ⇒ DataParser

Initialize parser

Parameters:

  • hit (Nokogiri::XML::Element)

    document hit



11
12
13
14
15
16
17
# File 'lib/relaton_ecma/data_parser.rb', line 11

# Set up the parser state for a single hit.
#
# @param hit [Nokogiri::XML::Element] document hit
def initialize(hit)
  @hit = hit
  # Defaults shared by every bibliographic item built from this hit.
  @bib = { type: "standard" }
  @bib[:language] = ["en"]
  @bib[:script] = ["Latn"]
  @bib[:place] = ["Geneva"]
  @agent = Mechanize.new
end

Instance Method Details

#contributor ⇒ Object



192
193
194
195
# File 'lib/relaton_ecma/data_parser.rb', line 192

# Build the contributor list: the organization "Ecma International" in the
# "publisher" role.
#
# @return [Array<Hash>] single-element list with :entity and :role keys
def contributor
  publisher_role = { type: "publisher" }
  ecma = RelatonBib::Organization.new(name: "Ecma International")
  [{ entity: ecma, role: [publisher_role] }]
end

#edition_id_parts(text) ⇒ Array<String,nil,Array<RelatonBib::BibliographicDate>>

Parse edition and date

Parameters:

  • text (String)

    identifier text

Returns:

  • (Array<String,nil,Array<RelatonBib::BibliographicDate>>)

    identifier, edition, publication dates and volume, in that order (`[id, ed, date, vol]`)



86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/relaton_ecma/data_parser.rb', line 86

# Split an edition line into identifier, edition, publication dates and volume.
#
# Every captured part is nil when the text does not match the pattern (or the
# optional part is absent); the date list is then empty.
#
# @param text [String] identifier text
# @return [Array] `[id, ed, dates, vol]` where `dates` is an
#   Array<RelatonBib::BibliographicDate> holding at most one "published" date
def edition_id_parts(text) # rubocop:disable Metrics/MethodLength
  found = %r{^
    (?<id>\w+(?:[\d-]+|\sTR/\d+)),?\s
    (?:Volume\s(?<vol>[\d.]+),?\s)?
    (?<ed>[\d.]+)(?:st|nd|rd|th)?\sedition
    (?:[,.]\s(?<dt>\w+\s\d+))?
  }x.match(text)
  parts = found ? found.named_captures : {}

  dates = []
  if parts["dt"]
    # "Month Year" is normalised to "YYYY-MM".
    month = Date.strptime(parts["dt"], "%B %Y").strftime("%Y-%m")
    dates << RelatonBib::BibliographicDate.new(type: "published", on: month)
  end
  [parts["id"], parts["ed"], dates, parts["vol"]]
end


73
74
75
76
77
# File 'lib/relaton_ecma/data_parser.rb', line 73

# Collect the links of one edition entry: the direct anchor (`./a`) as "src"
# and the anchor inside a span (`./span/a`) as "pdf". Missing anchors are
# skipped.
#
# @param hit [Nokogiri::XML::Element] edition list item
# @return [Array<RelatonBib::TypedUri>]
def edition_link(hit)
  anchors = [["src", hit.at("./a")], ["pdf", hit.at("./span/a")]]
  anchors.each_with_object([]) do |(kind, anchor), links|
    next unless anchor

    links << RelatonBib::TypedUri.new(type: kind, content: anchor[:href])
  end
end


122
123
124
# File 'lib/relaton_ecma/data_parser.rb', line 122

# Pick the translation links that belong to the given edition.
#
# @param edition [String, nil] edition number to match against the :ed value
#   of each `translation_link` entry
# @return [Array] the :link values of the matching entries
def edition_translation_link(edition)
  translation_link.each_with_object([]) do |entry, links|
    links << entry[:link] if entry[:ed] == edition
  end
end

#fetch_abstract ⇒ Array<RelatonBib::FormattedString>

Returns:

  • (Array<RelatonBib::FormattedString>)


148
149
150
151
152
153
154
155
# File 'lib/relaton_ecma/data_parser.rb', line 148

# Build the abstract from the paragraphs of the "ecma-item-content" div.
# Each paragraph is stripped, runs of spaces are squeezed and CRLF sequences
# are removed; paragraphs are joined with a newline.
#
# @return [Array<RelatonBib::FormattedString>] empty when there is no text
def fetch_abstract
  paragraphs = @doc.xpath('//div[@class="ecma-item-content"]/p')
  lines = paragraphs.map { |para| para.text.strip.squeeze(" ").gsub("\r\n", "") }
  text = lines.join("\n")
  return [] if text.empty?

  [RelatonBib::FormattedString.new(content: text, language: "en", script: "Latn")]
end

#fetch_date ⇒ Array<RelatonBib::BibliographicDate>

Returns:

  • (Array<RelatonBib::BibliographicDate>)


158
159
160
161
162
163
# File 'lib/relaton_ecma/data_parser.rb', line 158

# Read the publication date from every "ecma-item-edition" paragraph: the
# part of the text after the last ", " is used as the date.
#
# @return [Array<RelatonBib::BibliographicDate>] one "published" date per paragraph
def fetch_date
  edition_nodes = @doc.xpath('//p[@class="ecma-item-edition"]')
  edition_nodes.map do |node|
    *, published_on = node.text.split(", ")
    RelatonBib::BibliographicDate.new(type: "published", on: published_on)
  end
end

#fetch_docid(id = nil) ⇒ Array<RelatonBib::DocumentIdentifier>

Returns:

  • (Array<RelatonBib::DocumentIdentifier>)


101
102
103
104
# File 'lib/relaton_ecma/data_parser.rb', line 101

# Build the primary ECMA document identifier.
#
# @param id [String, nil] identifier; the text of the hit is used when omitted
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_docid(id = nil)
  identifier = id || @hit.text
  [RelatonBib::DocumentIdentifier.new(type: "ECMA", id: identifier, primary: true)]
end

#fetch_doctype ⇒ Object (also known as: fetch_mem_doctype)



215
216
217
# File 'lib/relaton_ecma/data_parser.rb', line 215

# Document type of every parsed item; always "document".
#
# @return [RelatonBib::DocumentType]
def fetch_doctype
  doctype = { type: "document" }
  RelatonBib::DocumentType.new(**doctype)
end

#fetch_edition ⇒ RelatonBib::Edition?

Returns:

  • (RelatonBib::Edition, nil)


187
188
189
190
# File 'lib/relaton_ecma/data_parser.rb', line 187

# Read the edition number from the first "ecma-item-edition" paragraph: the
# leading digits that are directly followed by an ordinal suffix.
#
# @return [RelatonBib::Edition, nil] nil when the paragraph is missing or does
#   not start with an ordinal number
def fetch_edition
  node = @doc.at('//p[@class="ecma-item-edition"]')
  return if node.nil?

  text = node.text
  return if text.nil?

  ordinal = text.match(/^\d+(?=(?:st|nd|th|rd))/)
  return if ordinal.nil?

  number = ordinal.to_s
  RelatonBib::Edition.new(content: number) unless number.empty?
end

#fetch_link ⇒ Array<RelatonBib::TypedUri>

Returns:

  • (Array<RelatonBib::TypedUri>)


107
108
109
110
111
112
113
114
# File 'lib/relaton_ecma/data_parser.rb', line 107

# Collect the links of the main item: the hit's own href as "src", the PDF
# anchor of the "ecma-item-content-wrapper" div as "pdf", and the translation
# links registered for the item's edition.
#
# @return [Array<RelatonBib::TypedUri>]
def fetch_link # rubocop:disable Metrics/AbcSize
  link = []
  link << RelatonBib::TypedUri.new(type: "src", content: @hit[:href]) if @hit[:href]
  ref = @doc.at('//div[@class="ecma-item-content-wrapper"]/span/a',
                '//div[@class="ecma-item-content-wrapper"]/a')
  link << RelatonBib::TypedUri.new(type: "pdf", content: ref[:href]) if ref
  # `parse` fills @bib in ATTRS order, where :link comes before :edition, so
  # @bib[:edition] is normally still unset at this point. Fall back to reading
  # the edition from the page so translation links are actually matched.
  edition = @bib[:edition] || fetch_edition
  link + edition_translation_link(edition&.content)
end

#fetch_mem_date ⇒ Object



203
204
205
206
207
# File 'lib/relaton_ecma/data_parser.rb', line 203

# Read the publication date of a hit without href from the paragraph under
# its second div; the "Month Year" text is normalised to "YYYY-MM".
#
# @return [Array<RelatonBib::BibliographicDate>] single "published" date
def fetch_mem_date
  raw = @hit.at("div[2]//p").text
  parsed = Date.strptime(raw, "%B %Y")
  [RelatonBib::BibliographicDate.new(type: "published", on: parsed.strftime("%Y-%m"))]
end

#fetch_mem_docid ⇒ Array<RelatonBib::DocumentIdentifier>

Returns:

  • (Array<RelatonBib::DocumentIdentifier>)


198
199
200
201
# File 'lib/relaton_ecma/data_parser.rb', line 198

# Build the identifier of a hit without href: "ECMA MEM/" followed by the
# text of the paragraph under the hit's first div.
#
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_mem_docid
  label = @hit.at("div[1]//p").text
  fetch_docid("ECMA MEM/#{label}")
end


116
117
118
119
120
# File 'lib/relaton_ecma/data_parser.rb', line 116

# Collect the links of a hit without href: every anchor found under
# `./div/section/div/p/a` becomes a "pdf" link.
#
# @return [Array<RelatonBib::TypedUri>]
def fetch_mem_link
  anchors = @hit.xpath("./div/section/div/p/a")
  anchors.each_with_object([]) do |anchor, links|
    links << RelatonBib::TypedUri.new(type: "pdf", content: anchor[:href])
  end
end

#fetch_mem_title ⇒ Object



209
210
211
212
213
# File 'lib/relaton_ecma/data_parser.rb', line 209

# Build the title of a hit without href from the text of the paragraph under
# its first div, which is used as the year in the title.
#
# @return [Array<Hash>] single title hash with :content, :language and :script
def fetch_mem_title
  year = @hit.at("div[1]//p").text
  title = { content: "\"Memento #{year}\" for year #{year}" }
  title[:language] = "en"
  title[:script] = "Latn"
  [title]
end

#fetch_relation ⇒ Array<Hash>

Returns:

  • (Array<Hash>)


166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/relaton_ecma/data_parser.rb', line 166

# Build an "updates" relation for every archived edition listed in the
# "ecma-item-archives" list. Entries whose text yields no edition number are
# skipped.
#
# @return [Array<Hash>] hashes with :type ("updates") and :bibitem keys
def fetch_relation # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity
  archived = @doc.xpath("//ul[@class='ecma-item-archives']/li")
  archived.filter_map do |item|
    ref, ed, date, vol = edition_id_parts(item.at("span").text)
    next unless ed && !ed.empty?

    attrs = { date: date }
    attrs[:formattedref] = RelatonBib::FormattedRef.new(content: ref, language: "en", script: "Latn")
    attrs[:docid] = [RelatonBib::DocumentIdentifier.new(type: "ECMA", id: ref, primary: true)]
    attrs[:link] = item.xpath("span/a").map do |anchor|
      RelatonBib::TypedUri.new(type: "pdf", content: anchor[:href])
    end
    attrs[:edition] = RelatonBib::Edition.new(content: ed)
    # The volume is optional; extent stays nil when none was parsed.
    attrs[:extent] = vol && [RelatonBib::Locality.new("volume", vol)]
    { type: "updates", bibitem: BibliographicItem.new(**attrs) }
  end
end

#fetch_title ⇒ Array<Hash>

Returns:

  • (Array<Hash>)


141
142
143
144
145
# File 'lib/relaton_ecma/data_parser.rb', line 141

# Build one title per "ecma-item-short-description" paragraph, using the
# stripped paragraph text as content.
#
# @return [Array<Hash>] hashes with :content, :language and :script keys
def fetch_title
  descriptions = @doc.xpath('//p[@class="ecma-item-short-description"]')
  descriptions.each_with_object([]) do |node, titles|
    titles << { content: node.text.strip, language: "en", script: "Latn" }
  end
end

#get_page(url) ⇒ Mechanize::Page

Get page with retries

Parameters:

  • url (String)

    url to fetch

Returns:

  • (Mechanize::Page)

    document



39
40
41
42
43
44
45
46
47
# File 'lib/relaton_ecma/data_parser.rb', line 39

# Get page with retries.
#
# Makes up to three attempts, sleeping 0, 1 and 2 seconds before them. Each
# failure is reported through `Util.error`.
#
# @param url [String] url to fetch
# @return [Mechanize::Page, nil] the page, or nil when every attempt failed
def get_page(url)
  3.times do |n|
    sleep n
    return @agent.get(url)
  rescue StandardError => e
    Util.error e.message
  end
  # `Integer#times` evaluates to the receiver, so without this the method
  # would return 3 after the last failure and callers' `unless @doc` guards
  # would treat it as a fetched page.
  nil
end

#parse ⇒ Object

Parse the hit and return the bibliographic item followed by the items built for its editions.

rubocop:disable Metrics/AbcSize,Metrics/MethodLength



19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/relaton_ecma/data_parser.rb', line 19

# Parse the hit into bibliographic items.
#
# A hit with an href is fetched (with a randomly chosen user agent alias) and
# every attribute in ATTRS is filled through `fetch_<name>`; a hit without an
# href is filled from the hit itself through `fetch_mem_<name>` for MATTRS.
#
# @return [Array<BibliographicItem>] the main item followed by the items
#   returned by `parse_editions`
def parse # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  if @hit[:href]
    # Sample from whatever aliases Mechanize ships instead of assuming there
    # are exactly 21 of them.
    @agent.user_agent_alias = Mechanize::AGENT_ALIASES.keys.sample
    @doc = get_page @hit[:href]
    ATTRS.each { |a| @bib[a] = send "fetch_#{a}" }
  else
    MATTRS.each { |a| @bib[a] = send "fetch_mem_#{a}" }
  end
  @bib[:contributor] = contributor
  items = [BibliographicItem.new(**@bib)]
  items + parse_editions
end

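#parse_editions ⇒ Array<BibliographicItem>

Parse editions

The method takes no arguments. It reads the fetched document (Mechanize::Page) from @doc and reuses @bib, the bibliographic item hash of the last edition, as the template for every edition it builds.

Returns:

  • (Array<BibliographicItem>)

    one item per listed edition with an edition number; empty when no document was fetched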



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/relaton_ecma/data_parser.rb', line 57

# Build one bibliographic item per edition listed on the fetched page.
#
# @bib is reused as the template: its :date and :link are overwritten for
# every list entry, and :docid, :edition and :extent for every entry that has
# an edition number. Entries without an edition number produce no item.
#
# @return [Array<BibliographicItem>] empty when no page was fetched
def parse_editions # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  return [] unless @doc

  latest_docid = @bib[:docid]
  rows = @doc.xpath('//div[@id="main"]/div[1]/div/main/article/div/div/standard/div/ul/li')
  rows.each_with_object([]) do |hit, items|
    id, ed, dates, vol = edition_id_parts(hit.at("./span", "./a").text)
    @bib[:date] = dates
    @bib[:link] = edition_link(hit) + edition_translation_link(ed)
    next if ed.nil? || ed.empty?

    # Entries without their own identifier keep the identifier of the main item.
    @bib[:docid] = id.nil? || id.empty? ? latest_docid : fetch_docid(id)
    @bib[:edition] = RelatonBib::Edition.new(content: ed)
    @bib[:extent] = vol && [RelatonBib::Locality.new("volume", vol)]
    items << BibliographicItem.new(**@bib)
  end
end


126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/relaton_ecma/data_parser.rb', line 126

# Collect the translation links listed on the fetched page. Only entries
# whose text names a "Japanese" version are turned into links; the result is
# memoized in @translation_link.
#
# @return [Array<Hash>] hashes with :ed (edition number) and :link
#   (RelatonBib::TypedUri); empty when no page was fetched
def translation_link
  return [] unless @doc

  @translation_link ||= begin
    rows = @doc.xpath("//main/article/div/div/standard/div[2]/ul/li")
    rows.filter_map do |row|
      anchor = row.at("span/a")
      label = row.at("span").text
      %r{\w+[\d-]+,\s(?<lang>\w+)\sversion,\s(?<ed>[\d.]+)(?:st|nd|rd|th)\sedition} =~ label
      next unless lang == "Japanese"

      uri = RelatonBib::TypedUri.new(type: "pdf", language: "ja", script: "Jpan", content: anchor[:href])
      { ed: ed, link: uri }
    end
  end
end