Class: RelatonCie::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_cie/data_fetcher.rb

Constant Summary collapse

URL =
"https://www.techstreet.com/cie/searches/31156444?page=1&per_page=100"

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Returns a new instance of DataFetcher.



12
13
14
15
16
17
# File 'lib/relaton_cie/data_fetcher.rb', line 12

# Prepares a fetcher that stores documents under +output+ in the given +format+.
#
# @param output [String] directory used as the prefix of every written file path
# @param format [String] serialization format; "bibxml" maps to the "xml" file extension,
#   any other value is used as the extension unchanged
def initialize(output, format)
  @files = []
  @output = output
  @format = format
  @ext = if format == "bibxml"
           "xml"
         else
           format
         end
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ Object



228
229
230
231
232
233
234
235
236
237
238
# File 'lib/relaton_cie/data_fetcher.rb', line 228

# Runs a complete fetch: creates the output directory, crawls the listing at URL with a
# new fetcher instance and prints the start time, stop time and rounded duration.
#
# @param output [String] directory to create and write into
# @param format [String] serialization format handed to the fetcher
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  puts "Started at: #{started_at}"

  FileUtils.mkdir_p output
  fetcher = new(output, format)
  fetcher.fetch URL

  finished_at = Time.now
  puts "Stopped at: #{finished_at}"
  elapsed = (finished_at - started_at).round
  puts "Done in: #{elapsed} sec."
end

Instance Method Details

#agent ⇒ Object



19
20
21
# File 'lib/relaton_cie/data_fetcher.rb', line 19

# Returns the Mechanize instance of this fetcher, creating it on first use and
# reusing it afterwards.
def agent
  return @agent if @agent

  @agent = Mechanize.new
end

#content(bib) ⇒ Object



182
183
184
185
186
187
188
# File 'lib/relaton_cie/data_fetcher.rb', line 182

# Serializes a bibliographic item according to @format.
#
# @param bib [Object] item responding to #to_xml, #to_hash and #to_bibxml
# @return [String, nil] the serialized item; nil when @format is none of
#   "xml", "yaml" or "bibxml"
def content(bib)
  if @format == "xml"
    bib.to_xml(bibdata: true)
  elsif @format == "yaml"
    bib.to_hash.to_yaml
  elsif @format == "bibxml"
    bib.to_bibxml
  end
end

#fetch(url) ⇒ Object



209
210
211
212
213
214
215
216
217
218
# File 'lib/relaton_cie/data_fetcher.rb', line 209

# Processes one page of search results and follows the "next_page" link until
# there is none, then saves the index.
#
# @param url [String] address of the result page to load
# @return [Object] whatever index.save returns
def fetch(url)
  page = time_req { agent.get url }
  hits = page.xpath("//li[@data-product]")
  hits.each { |product| parse_page product }

  next_link = page.at '//a[@class="next_page"]'
  # Last page reached: persist the index and stop recursing.
  return index.save unless next_link

  fetch "https://www.techstreet.com#{next_link[:href]}"
end

#fetch_abstract(doc) ⇒ Array<RelatonBib::FormattedString>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<RelatonBib::FormattedString>)


124
125
126
127
128
129
130
# File 'lib/relaton_cie/data_fetcher.rb', line 124

# Reads the abstract from the first <div> whose class contains "description".
#
# @param doc [Mechanize::Page] document detail page
# @return [Array<RelatonBib::FormattedString>] one English/Latin abstract, or an empty
#   array when the element is missing or its stripped text is empty
def fetch_abstract(doc)
  node = doc.at('//div[contains(@class,"description")]')
  text = node&.text&.strip
  return [] unless text && !text.empty?

  abstract = RelatonBib::FormattedString.new(
    content: text, language: "en", script: "Latn",
  )
  [abstract]
end

#fetch_contributor(doc) ⇒ Array<Hash>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<Hash>)


134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/relaton_cie/data_fetcher.rb', line 134

# Builds the contributor list of a document: one entry per author parsed from the page
# heading, followed by the CIE organization as publisher.
#
# @param doc [Mechanize::Page] document detail page
# @return [Array<Hash>] hashes with :entity and :role keys; author entries come first and
#   the publisher entry is always last
# @raise [StandardError] when the remaining author text does not match the name pattern
def fetch_contributor(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity
  # Text of every <p> under <hgroup> except the one with class "pub_date", with double
  # quotes removed.
  authors = doc.xpath('//hgroup/p[not(@class="pub_date")]').text.gsub "\"", ""
  contribs = []
  # Consume the author text one name at a time until nothing is left.
  until authors.empty?
    # The regexp literal is on the left of =~, so its named groups become local variables:
    #   sname1 - first surname part, optionally containing a " de " / " der " particle
    #   sname2 - second surname word, taken only when an initial (", X.") follows it
    #   fname  - forename word, taken only when no initial (", X.") follows it
    #   init   - run of single-character initials
    # The final group swallows the separator after the name, including an optional "and ".
    /^(?<sname1>\S+(?:\sder?\s)?[^\s,]+)
    (?:,?\s(?<sname2>[\w-]{2,})(?=,\s+\w\.))?
    (?:,?\s(?<fname>[\w-]{2,})(?!,\s+\w\.))?
    (?:(?:\s?,\s?|\s)(?<init>(?:\w(?:\s?\.|\s|,|$)[\s-]?)+))?
    (?:(?:,\s*|\s+|\.|(?<=\s))(?:and\s)?)?/x =~ authors
    # NOTE(review): $LAST_MATCH_INFO is the alias of $~ defined by the stdlib `English`
    # library; presumably it is required elsewhere in this file - confirm.
    raise StandardError, "Author name not found in \"#{authors}\"" unless $LAST_MATCH_INFO

    # Remove the matched name so the next iteration starts at the following author.
    authors.sub! $LAST_MATCH_INFO.to_s, ""
    sname = [sname1, sname2].compact.join " "
    surname = RelatonBib::LocalizedString.new sname, "en", "Latn"
    # Split the initials on "," or "." (optionally followed by "-" or whitespace).
    initial = (init&.strip || "").split(/(?:,|\.)(?:-|\s)?/).map do |int|
      RelatonBib::LocalizedString.new(int.strip, "en", "Latn")
    end
    forename = fname ? [RelatonBib::LocalizedString.new(fname, "en", "Latn")] : []
    fullname = RelatonBib::FullName.new surname: surname, forename: forename, initial: initial
    person = RelatonBib::Person.new name: fullname
    contribs << { entity: person, role: [{ type: "author" }] }
  end
  org = RelatonBib::Organization.new(
    name: "Commission Internationale de L'Eclairage", abbreviation: "CIE",
    url: "cie.co.at"
  )
  # Array#<< returns the array, so the full contributor list is the return value.
  contribs << { entity: org, role: [{ type: "publisher" }] }
end

#fetch_date(doc) ⇒ Array<RelatonBib::BibliographicDate>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<RelatonBib::BibliographicDate>)


84
85
86
87
88
89
90
# File 'lib/relaton_cie/data_fetcher.rb', line 84

# Collects publication dates from the <dd> that directly follows each "Published:" <dt>.
#
# A value starting with four digits that are not followed by "-" is kept as written;
# anything else is parsed as month/day/year and re-formatted as YYYY-MM-DD.
#
# @param doc [Mechanize::Page] document detail page
# @return [Array<RelatonBib::BibliographicDate>] dates of type "published"
def fetch_date(doc)
  nodes = doc.xpath('//dt[.="Published:"]/following-sibling::dd[1]')
  nodes.map do |node|
    raw = node.text.strip
    on = if raw.match?(/^\d{4}(?:[^-]|$)/)
           raw
         else
           Date.strptime(raw, "%m/%d/%Y").strftime("%Y-%m-%d")
         end
    RelatonBib::BibliographicDate.new(type: "published", on: on)
  end
end

#fetch_docid(hit, doc) ⇒ Array<RelatonBib::DocumentIdentifier>

Parameters:

  • hit (Nokogiri::XML::Element)
  • doc (Mechanize::Page)

Returns:

  • (Array<RelatonBib::DocumentIdentifier>)


30
31
32
33
34
35
36
37
38
39
40
# File 'lib/relaton_cie/data_fetcher.rb', line 30

# Builds the identifiers of a document: the primary CIE identifier, an optional second
# identifier typed by its leading word, and an ISBN when the page lists one.
#
# @param hit [Object] search-result element, passed on to parse_code
# @param doc [Mechanize::Page] document detail page
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_docid(hit, doc)
  primary, secondary = parse_code hit, doc
  ids = []
  ids << RelatonBib::DocumentIdentifier.new(type: "CIE", id: primary, primary: true)
  if secondary
    # The first word of the second code serves as its type.
    secondary_type = secondary.match(/\w+/).to_s
    ids << RelatonBib::DocumentIdentifier.new(type: secondary_type, id: secondary.strip)
  end
  isbn_node = doc.at('//dt[contains(.,"ISBN")]/following-sibling::dd')
  if isbn_node
    ids << RelatonBib::DocumentIdentifier.new(type: "ISBN", id: isbn_node.text.strip)
  end
  ids
end

#fetch_docnumber(hit) ⇒ Object



69
70
71
# File 'lib/relaton_cie/data_fetcher.rb', line 69

# Derives the document number: the primary code with its first "word + whitespace"
# occurrence removed.
#
# @param hit [Object] search-result element, passed on to parse_code
# @return [String]
def fetch_docnumber(hit)
  primary, = parse_code(hit)
  primary.sub(/\w+\s/, "")
end

#fetch_doctype ⇒ Object



163
164
165
# File 'lib/relaton_cie/data_fetcher.rb', line 163

# Returns the document type used for every fetched item.
#
# @return [RelatonBib::DocumentType] a type of "document"
def fetch_doctype
  RelatonBib::DocumentType.new type: "document"
end

#fetch_edition(doc) ⇒ String

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (String)


94
95
96
# File 'lib/relaton_cie/data_fetcher.rb', line 94

# Extracts the edition number from the <dd> following the "Edition:" <dt>.
#
# Only leading digits directly followed by an ordinal suffix (st, nd, rd, th) count.
#
# @param doc [Mechanize::Page] document detail page
# @return [String, nil] the digits, or nil when the element or the pattern is absent
def fetch_edition(doc)
  node = doc.at('//dt[.="Edition:"]/following-sibling::dd')
  return unless node

  ordinal = node.text&.match(/^\d+(?=(st|nd|rd|th))/)
  ordinal&.to_s
end

#fetch_link(url) ⇒ Array<RelatonBib::TypedUri>

Parameters:

  • url (String)

Returns:

  • (Array<RelatonBib::TypedUri>)


118
119
120
# File 'lib/relaton_cie/data_fetcher.rb', line 118

# Wraps the document address in a single "src" link.
#
# @param url [String] address of the document page
# @return [Array<RelatonBib::TypedUri>] one-element array
def fetch_link(url)
  source = RelatonBib::TypedUri.new(type: "src", content: url)
  [source]
end

#fetch_relation(doc) ⇒ Array<Hash>

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (Array<Hash>)


100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/relaton_cie/data_fetcher.rb', line 100

# Builds relations from the "history" section of a document page, skipping the entry
# whose class contains "selected-product".
#
# @param doc [Mechanize::Page] document detail page
# @return [Array<Hash>] hashes with :type ("updates" or "updatedBy") and :bibitem
def fetch_relation(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  entries = doc.xpath('//section[@class="history"]/ol/li[not(contains(@class,"selected-product"))]')
  entries.map do |entry|
    anchor = entry.at("a")
    target = "https://www.techstreet.com#{anchor[:href]}"
    title_text = anchor.at('p/span[@class="title"]').text
    title = RelatonBib::TypedTitleString.from_string title_text
    identifier = anchor.at("h3").text
    docid = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: identifier, primary: true)]
    published = anchor.at("p/time")
    date = [RelatonBib::BibliographicDate.new(type: "published", on: published[:datetime])]
    link = [RelatonBib::TypedUri.new(type: "src", content: target)]
    bibitem = BibliographicItem.new docid: docid, title: title, link: link, date: date
    # NOTE(review): this XPath starts with "//", which looks like a whole-document
    # search rather than one scoped to this entry - confirm that is intended.
    relation_type = if anchor.at('//li/i[contains(@class,"historical")]')
                      "updates"
                    else
                      "updatedBy"
                    end
    { type: relation_type, bibitem: bibitem }
  end
end

#fetch_title(doc) ⇒ RelatonBib::TypedTitleStringCollection, Array

Parameters:

  • doc (Mechanize::Page)

Returns:

  • (RelatonBib::TypedTitleStringCollection, Array)


75
76
77
78
79
80
# File 'lib/relaton_cie/data_fetcher.rb', line 75

# Reads the title from the <h2> or <h1> inside <hgroup>.
#
# @param doc [Mechanize::Page] document detail page
# @return [RelatonBib::TypedTitleStringCollection, Array] the parsed title, or an empty
#   array when neither heading exists
def fetch_title(doc)
  heading = doc.at("//hgroup/h2", "//hgroup/h1")
  if heading
    RelatonBib::TypedTitleString.from_string heading.text.strip
  else
    []
  end
end

#index ⇒ Object



23
24
25
# File 'lib/relaton_cie/data_fetcher.rb', line 23

# Returns the :cie index backed by "index-v1.yaml", looking it up (or creating it) on
# first use and reusing it afterwards.
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:cie, file: "index-v1.yaml")
end

#parse_cie_code(code1, code2, doc = nil) ⇒ Object

Builds the CIE identifier from the parsed code parts and appends "Add N" when the page heading ends with "Addendum N".



61
62
63
64
65
66
67
# File 'lib/relaton_cie/data_fetcher.rb', line 61

# Chooses the CIE identifier and appends an addendum suffix when the page names one.
#
# When code1 is longer than 25 characters and code2 is present, the identifier is
# "CIE " plus code2 without its first comma (and a following " Pages"); otherwise code1
# is used. If the <hgroup>/<h2> text ends a line with "Addendum N", " Add N" is appended.
#
# @param code1 [String] text before the parenthesis of the listed code
# @param code2 [String, nil] product code captured from the parenthesis
# @param doc [Mechanize::Page, nil] document detail page
# @return [String]
def parse_cie_code(code1, code2, doc = nil) # rubocop:disable Metrics/CyclomaticComplexity
  code = if code1.size > 25 && code2
           "CIE #{code2.sub(/,(\sPages)?/, '')}"
         else
           code1
         end
  heading = doc&.at("//hgroup/h2")&.text
  addendum = heading&.match(/(Add)endum\s(\d+)$/)
  addendum ? "#{code} #{addendum[1]} #{addendum[2]}" : code
end

#parse_code(hit, doc = nil) ⇒ Object



42
43
44
45
46
47
# File 'lib/relaton_cie/data_fetcher.rb', line 42

# Splits the code shown in a search hit into a primary code and an optional ISO/IEC code.
#
# The link text is stripped, runs of spaces are squeezed, the first U+25B9 character is
# removed and " / " is collapsed to "/". An ISO or IEC code that follows "(" or "/" is
# cut off and returned separately; the rest is resolved by primary_code.
#
# @param hit [Object] search-result element containing an h3/a link
# @param doc [Mechanize::Page, nil] document detail page, passed on to primary_code
# @return [Array] two elements: the primary code and the ISO/IEC code (nil when absent)
def parse_code(hit, doc = nil)
  label = hit.at("h3/a").text.strip.squeeze(" ")
  code = label.sub(/\u25b9/, "").gsub(" / ", "/")
  secondary = %r{(?:\(|/)(?<c2>(?:ISO|IEC)\s[^()]+)}.match(code)
  code = code[0...secondary.begin(0)].strip if secondary
  [primary_code(code, doc), secondary && secondary[:c2]]
end

#parse_page(hit) ⇒ Object

Parameters:

  • hit (Nokogiri::XML::Element)


191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/relaton_cie/data_fetcher.rb', line 191

# Loads the detail page of one search hit, builds a bibliographic item from it and
# writes the item to disk. Any StandardError is reported through Util.error instead of
# being raised, so one failing document does not interrupt the caller.
#
# @param hit [Object] search-result element containing an h3/a link
def parse_page(hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  url = "https://www.techstreet.com#{hit.at('h3/a')[:href]}"
  doc = time_req { agent.get url }
  attrs = {
    type: "standard",
    link: fetch_link(url),
    docnumber: fetch_docnumber(hit),
    docid: fetch_docid(hit, doc),
    title: fetch_title(doc),
    abstract: fetch_abstract(doc),
    date: fetch_date(doc),
    edition: fetch_edition(doc),
    contributor: fetch_contributor(doc),
    relation: fetch_relation(doc),
    language: ["en"],
    script: ["Latn"],
    doctype: fetch_doctype,
  }
  write_file BibliographicItem.new(**attrs)
rescue StandardError => e
  Util.error { "Document: #{url}\n#{e.message}\n#{e.backtrace}" }
end

#primary_code(code, doc = nil) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
# File 'lib/relaton_cie/data_fetcher.rb', line 49

# Resolves the primary CIE identifier for a listed code.
#
# Codes whose text before the parenthesis starts with "CIE" are handled by
# parse_cie_code. Otherwise the first entry of the page's "Product Code(s):" value is
# used when present, and as a last resort the product code inside the parentheses of
# the listed code, with its commas removed or turned into spaces.
#
# @param code [String] code as listed in the search result
# @param doc [Mechanize::Page, nil] document detail page
# @return [String]
def primary_code(code, doc = nil)
  parts = /^(?<code1>[^(]+)(?:\((?<code2>\w+\d+,(?:\sPages)?[^)]+))?/.match(code)
  head = parts && parts[:code1]
  return parse_cie_code(head, parts[:code2], doc) if head&.match?(/^CIE/)

  product = doc&.at('//dt[.="Product Code(s):"]/following-sibling::dd')
  return "CIE #{product.text.strip.match(/[^,]+/)}" if product

  number = code.match(/(?<=\()\w{2}\d+,.+(?=\))/).to_s
  # A comma before whitespace is dropped; a comma before anything else becomes a space.
  number = number.gsub(/,(?=\s)/, "").gsub(/,(?=\S)/, " ")
  "CIE #{number}"
end

#time_req ⇒ Object



220
221
222
223
224
225
226
# File 'lib/relaton_cie/data_fetcher.rb', line 220

# Runs the given block and then sleeps for whatever is left of +min_interval+, so that
# consecutive calls are spaced at least that far apart.
#
# Elapsed time is taken from the monotonic clock, so a wall-clock adjustment during the
# block cannot lengthen or shorten the pause.
#
# @param min_interval [Numeric] minimum duration of the whole call in seconds
# @yield the work to time
# @return [Object] the value returned by the block
def time_req(min_interval = 1)
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  result = yield
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
  remaining = min_interval - elapsed
  # Sleep only when the block finished faster than the minimum interval.
  sleep remaining if remaining.positive?
  result
end

#write_file(bib) ⇒ Object

Parameters:

  • bib (BibliographicItem)


168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/relaton_cie/data_fetcher.rb', line 168

# Writes a bibliographic item to disk and records it in the index.
#
# The file name is the id of the item's first document identifier with "/", whitespace,
# "-", ":" and "." replaced by "_" and upper-cased. The extension is @ext, so "bibxml"
# output is stored as ".xml". When the same path was already produced by this fetcher a
# warning is logged; the file is still written and the index entry is still updated.
#
# @param bib [BibliographicItem] item to store
# @return [Integer] result of File.write
def write_file(bib) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  docid = bib.docidentifier[0].id
  name = docid.gsub(%r{[/\s\-:.]}, "_").upcase
  file = "#{@output}/#{name}.#{@ext}"
  if @files.include? file
    Util.warn do
      # The item may carry no "src" link; the warning must not fail because of that.
      "File #{file} exists. Docid: #{docid}\n" \
      "Link: #{bib.link.detect { |l| l.type == 'src' }&.content}"
    end
  else
    @files << file
  end
  index.add_or_update docid, file
  File.write file, content(bib), encoding: "UTF-8"
end