Class: RelatonEcma::DataParser
- Inherits:
-
Object
- Object
- RelatonEcma::DataParser
- Defined in:
- lib/relaton_ecma/data_parser.rb
Constant Summary collapse
# Attributes fetched for "memento" hits (hits without an href); each name
# maps to a fetch_mem_<name> method.
MATTRS = %i[docid title date link doctype].freeze

# Attributes fetched for regular document hits; each name maps to a
# fetch_<name> method. :doctype is already part of MATTRS, so it is not
# repeated here, and the combined array (not just the literal) is frozen.
ATTRS = (MATTRS + %i[abstract relation edition]).freeze
Instance Method Summary collapse
- #contributor ⇒ Object
-
#edition_id_parts(text) ⇒ Array<String,nil,Array<RelatonBib::BibliographicDate>>
Parse the identifier, edition, publication date and volume from an edition label.
- #edition_link(hit) ⇒ Object
- #edition_translation_link(edition) ⇒ Object
- #fetch_abstract ⇒ Array<RelatonBib::FormattedString>
- #fetch_date ⇒ Array<RelatonBib::BibliographicDate>
- #fetch_docid(id = nil) ⇒ Array<RelatonBib::DocumentIdentifier>
- #fetch_doctype ⇒ Object (also: #fetch_mem_doctype)
- #fetch_edition ⇒ RelatonBib::Edition?
- #fetch_link ⇒ Array<RelatonBib::TypedUri>
- #fetch_mem_date ⇒ Object
- #fetch_mem_docid ⇒ Array<RelatonBib::DocumentIdentifier>
- #fetch_mem_link ⇒ Object
- #fetch_mem_title ⇒ Object
- #fetch_relation ⇒ Array<Hash>
- #fetch_title ⇒ Array<Hash>
-
#get_page(url) ⇒ Mechanize::Page
Get page with retries.
-
#initialize(hit) ⇒ DataParser
constructor
Initialize parser.
-
#parse ⇒ Object
Parse the hit into bibliographic items: the document itself followed by one item per listed edition.
-
#parse_editions ⇒ Array<BibliographicItem>
Parse editions.
- #translation_link ⇒ Object
Constructor Details
#initialize(hit) ⇒ DataParser
Initialize parser
11 12 13 14 15 16 17 |
# File 'lib/relaton_ecma/data_parser.rb', line 11
#
# Initialize parser.
#
# Keeps the hit, creates the Mechanize agent used for page requests and seeds
# the attribute hash with the values every item shares (type, language,
# script, place).
#
# @param hit [Object] search-result node to parse; the rest of this class
#   reads it through +[:href]+, +#text+, +#at+ and +#xpath+
#   (presumably a Nokogiri element -- confirm against the caller)
def initialize(hit)
  @hit = hit
  @agent = Mechanize.new
  @bib = {
    type: "standard",
    language: %w[en],
    script: %w[Latn],
    place: %w[Geneva],
  }
end
Instance Method Details
#contributor ⇒ Object
192 193 194 195 |
# File 'lib/relaton_ecma/data_parser.rb', line 192
#
# Build the contributor list: Ecma International in the publisher role.
#
# @return [Array<Hash>] single entry with :entity (the organization) and :role
def contributor
  publisher_role = { type: "publisher" }
  ecma = RelatonBib::Organization.new(name: "Ecma International")
  [{ entity: ecma, role: [publisher_role] }]
end
#edition_id_parts(text) ⇒ Array<String,nil,Array<RelatonBib::BibliographicDate>>
Parse the identifier, edition, publication date and volume from an edition label
86 87 88 89 90 91 92 93 94 95 96 97 98 |
# File 'lib/relaton_ecma/data_parser.rb', line 86
#
# Split an edition label such as "ECMA-262, 5.1 edition, June 2011" into its
# parts.
#
# @param text [String, nil] edition label
# @return [Array] four elements, in this order:
#   identifier (String, nil), edition number (String, nil),
#   dates (Array<RelatonBib::BibliographicDate>, empty when the label has no
#   usable "Month Year" part) and volume (String, nil).
#   Everything is nil / empty when the label does not match the pattern.
def edition_id_parts(text) # rubocop:disable Metrics/MethodLength
  # The regexp literal must stay on the left of =~ so that Ruby assigns the
  # named groups to the locals id, vol, ed and dt.
  %r{^
    (?<id>\w+(?:[\d-]+|\sTR/\d+)),?\s
    (?:Volume\s(?<vol>[\d.]+),?\s)?
    (?<ed>[\d.]+)(?:st|nd|rd|th)?\sedition
    (?:[,.]\s(?<dt>\w+\s\d+))?
  }x =~ text

  # The dt group accepts any word followed by digits, so it may capture
  # something that is not a month name. Treat that as "no date" instead of
  # letting the parse error abort the whole document.
  on = begin
    dt && Date.strptime(dt, "%B %Y").strftime("%Y-%m")
  rescue ArgumentError # Date::Error is a subclass of ArgumentError
    nil
  end
  date = on ? [RelatonBib::BibliographicDate.new(type: "published", on: on)] : []
  [id, ed, date, vol]
end
#edition_link(hit) ⇒ Object
73 74 75 76 77 |
# File 'lib/relaton_ecma/data_parser.rb', line 73
#
# Collect the links of one edition entry: "src" from the entry's direct
# anchor and "pdf" from the anchor inside its span. Missing anchors are
# skipped.
#
# @param hit [Object] edition list entry answering +#at+ with anchor nodes
# @return [Array<RelatonBib::TypedUri>]
def edition_link(hit)
  anchors = [["src", hit.at("./a")], ["pdf", hit.at("./span/a")]]
  anchors.each_with_object([]) do |(kind, anchor), links|
    next unless anchor

    links << RelatonBib::TypedUri.new(type: kind, content: anchor[:href])
  end
end
#edition_translation_link(edition) ⇒ Object
122 123 124 |
# File 'lib/relaton_ecma/data_parser.rb', line 122
#
# Pick the translation links that belong to the given edition.
#
# @param edition [String, nil] edition number to match against the :ed value
#   of each #translation_link entry
# @return [Array] the :link values of the matching entries, in page order
def edition_translation_link(edition)
  translation_link.each_with_object([]) do |entry, links|
    links << entry[:link] if entry[:ed] == edition
  end
end
#fetch_abstract ⇒ Array<RelatonBib::FormattedString>
148 149 150 151 152 153 154 155 |
# File 'lib/relaton_ecma/data_parser.rb', line 148
#
# Build the abstract from the paragraphs of the "ecma-item-content" div.
# Each paragraph is stripped, runs of spaces are squeezed to one and CRLF
# pairs are removed; paragraphs are joined with a newline.
#
# @return [Array<RelatonBib::FormattedString>] one element, or an empty
#   array when the page has no abstract text
def fetch_abstract
  paragraphs = @doc.xpath('//div[@class="ecma-item-content"]/p')
  lines = paragraphs.map { |par| par.text.strip.squeeze(" ").gsub("\r\n", "") }
  text = lines.join("\n")
  return [] if text.empty?

  abstract = RelatonBib::FormattedString.new(content: text, language: "en", script: "Latn")
  [abstract]
end
#fetch_date ⇒ Array<RelatonBib::BibliographicDate>
158 159 160 161 162 163 |
# File 'lib/relaton_ecma/data_parser.rb', line 158
#
# Read the publication date from every "ecma-item-edition" paragraph. The
# date is whatever follows the last ", " in the paragraph text and is passed
# on unparsed.
#
# @return [Array<RelatonBib::BibliographicDate>]
def fetch_date
  edition_lines = @doc.xpath('//p[@class="ecma-item-edition"]')
  edition_lines.map do |line|
    published_on = line.text.split(", ")[-1]
    RelatonBib::BibliographicDate.new(type: "published", on: published_on)
  end
end
#fetch_docid(id = nil) ⇒ Array<RelatonBib::DocumentIdentifier>
101 102 103 104 |
# File 'lib/relaton_ecma/data_parser.rb', line 101
#
# Wrap an identifier in a primary "ECMA" document identifier.
#
# @param id [String, nil] identifier to use; the hit's text when omitted
# @return [Array<RelatonBib::DocumentIdentifier>] single-element array
def fetch_docid(id = nil)
  identifier = id || @hit.text
  docid = RelatonBib::DocumentIdentifier.new(type: "ECMA", id: identifier, primary: true)
  [docid]
end
#fetch_doctype ⇒ Object Also known as: fetch_mem_doctype
215 216 217 |
# File 'lib/relaton_ecma/data_parser.rb', line 215
#
# Document type shared by every item this parser produces.
#
# @return [RelatonBib::DocumentType] type "document"
def fetch_doctype
  RelatonBib::DocumentType.new(type: "document")
end
#fetch_edition ⇒ RelatonBib::Edition?
187 188 189 190 |
# File 'lib/relaton_ecma/data_parser.rb', line 187
#
# Read the edition number from the first "ecma-item-edition" paragraph.
# Only leading digits directly followed by an ordinal suffix (st, nd, th,
# rd) count as an edition number.
#
# @return [RelatonBib::Edition, nil] nil when the paragraph is missing or
#   does not start with such a number
def fetch_edition
  node = @doc.at('//p[@class="ecma-item-edition"]')
  return unless node

  number = node.text&.slice(/^\d+(?=(?:st|nd|th|rd))/)
  return if number.nil? || number.empty?

  RelatonBib::Edition.new(content: number)
end
#fetch_link ⇒ Array<RelatonBib::TypedUri>
107 108 109 110 111 112 113 114 |
# File 'lib/relaton_ecma/data_parser.rb', line 107
#
# Collect the links of the main document: the page itself ("src"), the PDF
# referenced from the content wrapper ("pdf") and the translation links
# that belong to the document's edition.
#
# @return [Array<RelatonBib::TypedUri>]
def fetch_link # rubocop:disable Metrics/AbcSize
  link = []
  link << RelatonBib::TypedUri.new(type: "src", content: @hit[:href]) if @hit[:href]
  ref = @doc.at('//div[@class="ecma-item-content-wrapper"]/span/a', '//div[@class="ecma-item-content-wrapper"]/a')
  link << RelatonBib::TypedUri.new(type: "pdf", content: ref[:href]) if ref
  # #parse fills @bib in ATTRS order and :link comes before :edition, so on a
  # fresh parser @bib[:edition] is not set yet. Read the edition from the
  # page in that case; otherwise the translation lookup can never match.
  edition = @bib[:edition] || fetch_edition
  link + edition_translation_link(edition&.content)
end
#fetch_mem_date ⇒ Object
203 204 205 206 207 |
# File 'lib/relaton_ecma/data_parser.rb', line 203
#
# Publication date of a memento hit, read from the paragraph under the
# hit's second div and expected in "Month Year" form.
#
# @return [Array<RelatonBib::BibliographicDate>] single "published" date
#   formatted as YYYY-MM
# @raise [ArgumentError] from Date.strptime when the text is not "Month Year"
def fetch_mem_date
  label = @hit.at("div[2]//p").text
  month = Date.strptime(label, "%B %Y")
  published = RelatonBib::BibliographicDate.new(type: "published", on: month.strftime("%Y-%m"))
  [published]
end
#fetch_mem_docid ⇒ Array<RelatonBib::DocumentIdentifier>
198 199 200 201 |
# File 'lib/relaton_ecma/data_parser.rb', line 198
#
# Document identifier of a memento hit: "ECMA MEM/" followed by the text of
# the paragraph under the hit's first div.
#
# @return [Array<RelatonBib::DocumentIdentifier>] result of #fetch_docid
def fetch_mem_docid
  year = @hit.at("div[1]//p").text
  fetch_docid("ECMA MEM/#{year}")
end
#fetch_mem_link ⇒ Object
116 117 118 119 120 |
# File 'lib/relaton_ecma/data_parser.rb', line 116
#
# Links of a memento hit: one "pdf" link per anchor found under
# ./div/section/div/p of the hit.
#
# @return [Array<RelatonBib::TypedUri>]
def fetch_mem_link
  anchors = @hit.xpath("./div/section/div/p/a")
  anchors.each_with_object([]) do |anchor, links|
    links << RelatonBib::TypedUri.new(type: "pdf", content: anchor[:href])
  end
end
#fetch_mem_title ⇒ Object
209 210 211 212 213 |
# File 'lib/relaton_ecma/data_parser.rb', line 209
#
# Title of a memento hit, built from the text of the paragraph under the
# hit's first div (used as the year).
#
# @return [Array<Hash>] single English title hash (:content, :language, :script)
def fetch_mem_title
  year = @hit.at("div[1]//p").text
  title = {
    content: "\"Memento #{year}\" for year #{year}",
    language: "en",
    script: "Latn",
  }
  [title]
end
#fetch_relation ⇒ Array<Hash>
166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
# File 'lib/relaton_ecma/data_parser.rb', line 166
#
# Build an "updates" relation for every entry of the "ecma-item-archives"
# list whose label yields an edition number (see #edition_id_parts).
#
# @return [Array<Hash>] hashes with :type ("updates") and :bibitem
def fetch_relation # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity
  archived = @doc.xpath("//ul[@class='ecma-item-archives']/li")
  archived.each_with_object([]) do |entry, relations|
    ref, ed, date, vol = edition_id_parts(entry.at("span").text)
    # Entries without a recognisable edition number are skipped.
    next unless ed && !ed.empty?

    formatted = RelatonBib::FormattedRef.new(content: ref, language: "en", script: "Latn")
    identifier = RelatonBib::DocumentIdentifier.new(type: "ECMA", id: ref, primary: true)
    pdf_links = entry.xpath("span/a").map do |anchor|
      RelatonBib::TypedUri.new(type: "pdf", content: anchor[:href])
    end
    edition = RelatonBib::Edition.new(content: ed)
    # extent stays nil when the label names no volume.
    extent = vol && [RelatonBib::Locality.new("volume", vol)]
    item = BibliographicItem.new(
      docid: [identifier],
      formattedref: formatted,
      date: date,
      edition: edition,
      link: pdf_links,
      extent: extent,
    )
    relations << { type: "updates", bibitem: item }
  end
end
#fetch_title ⇒ Array<Hash>
141 142 143 144 145 |
# File 'lib/relaton_ecma/data_parser.rb', line 141
#
# Titles of the document: the stripped text of every
# "ecma-item-short-description" paragraph, tagged as English / Latin script.
#
# @return [Array<Hash>] hashes with :content, :language and :script
def fetch_title
  descriptions = @doc.xpath('//p[@class="ecma-item-short-description"]')
  descriptions.map do |node|
    { content: node.text.strip }.merge(language: "en", script: "Latn")
  end
end
#get_page(url) ⇒ Mechanize::Page
Get page with retries
39 40 41 42 43 44 45 46 47 |
# File 'lib/relaton_ecma/data_parser.rb', line 39
#
# Get page with retries.
#
# Makes up to three attempts. Attempt n (counting from 0) first sleeps n
# seconds, so the waits are 0, 1 and 2 seconds. Each failure is reported
# through Util.error and the next attempt follows.
#
# @param url [String] address of the page
# @return [Mechanize::Page, nil] the page, or nil when every attempt failed
def get_page(url)
  3.times do |n|
    sleep n
    return @agent.get(url)
  rescue StandardError => e
    Util.error e.message
  end
  # Without this the method would return 3 (the value of Integer#times),
  # which is truthy and slips past the `unless @doc` guards of its callers.
  nil
end
#parse ⇒ Object
Parse the hit into bibliographic items: the document itself followed by one item per listed edition
19 20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/relaton_ecma/data_parser.rb', line 19
#
# Parse the hit into bibliographic items.
#
# A hit with an href is a regular document: its page is downloaded with a
# randomly chosen user-agent alias and every ATTRS attribute is filled by
# the matching fetch_<name> method. A hit without an href is filled from the
# hit itself through the fetch_mem_<name> methods listed in MATTRS.
#
# @return [Array] the main BibliographicItem followed by the items returned
#   by #parse_editions
def parse # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  if @hit[:href]
    # Pick from whatever aliases Mechanize ships instead of assuming the
    # table has exactly 21 entries.
    @agent.user_agent_alias = Mechanize::AGENT_ALIASES.keys.sample
    @doc = get_page @hit[:href]
    ATTRS.each { |a| @bib[a] = send "fetch_#{a}" }
  else
    MATTRS.each { |a| @bib[a] = send "fetch_mem_#{a}" }
  end
  @bib[:contributor] = contributor
  items = [BibliographicItem.new(**@bib)]
  items + parse_editions
end
#parse_editions ⇒ Array<BibliographicItem>
Returns one item per listed edition that has an edition number, or an empty array when no page was loaded.
Parse editions
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# File 'lib/relaton_ecma/data_parser.rb', line 57
#
# Parse editions.
#
# Builds one item per entry of the page's edition list, reusing @bib as a
# template: :date, :link, :docid, :edition and :extent are overwritten for
# every entry. Entries whose label yields no edition number produce no
# item; note that :date and :link are overwritten before that check.
#
# @return [Array<BibliographicItem>] empty when no page was loaded
def parse_editions # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  return [] unless @doc

  # Identifier of the main document, used when an entry has none of its own.
  main_docid = @bib[:docid]
  rows = @doc.xpath('//div[@id="main"]/div[1]/div/main/article/div/div/standard/div/ul/li')
  rows.each_with_object([]) do |row, items|
    id, ed, dates, vol = edition_id_parts(row.at("./span", "./a").text)
    @bib[:date] = dates
    @bib[:link] = edition_link(row) + edition_translation_link(ed)
    next unless ed && !ed.empty?

    @bib[:docid] = id && !id.empty? ? fetch_docid(id) : main_docid
    @bib[:edition] = RelatonBib::Edition.new(content: ed)
    @bib[:extent] = vol && [RelatonBib::Locality.new("volume", vol)]
    items << BibliographicItem.new(**@bib)
  end
end
#translation_link ⇒ Object
126 127 128 129 130 131 132 133 134 135 136 137 138 |
# File 'lib/relaton_ecma/data_parser.rb', line 126
#
# Collect the translation links listed on the page. Only labels of the form
# "<id>, Japanese version, <N>th edition" are recognised; other languages
# and entries without an anchor are skipped. The result is memoized.
#
# @return [Array<Hash>] hashes with :ed (edition number, String) and
#   :link (RelatonBib::TypedUri); empty when no page was loaded
def translation_link
  return [] unless @doc

  @translation_link ||= @doc.xpath("//main/article/div/div/standard/div[2]/ul/li").filter_map do |l|
    a = l.at("span/a")
    next unless a # no anchor, nothing to link to (same guard as #edition_link)

    # The regexp literal stays on the left of =~ so the named groups are
    # assigned to the locals lang and ed; a missing span leaves both nil.
    %r{\w+[\d-]+,\s(?<lang>\w+)\sversion,\s(?<ed>[\d.]+)(?:st|nd|rd|th)\sedition} =~ l.at("span")&.text
    next unless lang == "Japanese"

    { ed: ed, link: RelatonBib::TypedUri.new(type: "pdf", language: "ja", script: "Jpan", content: a[:href]) }
  end
end