Class: RelatonEcma::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_ecma/data_fetcher.rb

Constant Summary collapse

URL =
"https://www.ecma-international.org/publications-and-standards/"

Instance Method Summary collapse

Constructor Details

#initialize(output: "data", format: "yaml") ⇒ DataFetcher

Returns a new instance of DataFetcher.

Parameters:

  • :output (String)

    directory to output documents

  • :format (String)

    output format (xml, yaml, bibxml)



13
14
15
16
17
18
19
20
21
# File 'lib/relaton_ecma/data_fetcher.rb', line 13

# Set up the fetcher state: output location, output format, the list of
# files written so far, the ECMA index and the HTTP agent.
#
# @param output [String] directory to output documents
# @param format [String] output format (xml, yaml, bibxml)
def initialize(output: "data", format: "yaml")
  @output = output
  @format = format
  # A leading "bib" is stripped, so "bibxml" documents get the "xml" extension.
  @ext = format.sub(/\Abib/, "")
  @files = []
  @index = Relaton::Index.find_or_create :ECMA
  @agent = Mechanize.new
  # Choose among the aliases that actually exist. Indexing with a fixed
  # random range yields nil whenever the list is shorter than that range and
  # never reaches entries beyond it.
  @agent.user_agent_alias = Mechanize::AGENT_ALIASES.keys.sample
end

Instance Method Details

#fetch ⇒ void

This method returns an undefined value.

Fetch data from Ecma website.



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/relaton_ecma/data_fetcher.rb', line 80

# Fetch data from Ecma website.
#
# Creates the output directory, processes the "standards",
# "technical-reports" and "mementos" listings in that order, saves the
# index, and prints start time, stop time and elapsed seconds.
#
# @return [void]
def fetch
  started_at = Time.now
  puts "Started at: #{started_at}"

  FileUtils.mkdir_p @output

  %w[standards technical-reports mementos].each { |type| html_index type }
  @index.save

  finished_at = Time.now
  puts "Stopped at: #{finished_at}"
  elapsed = (finished_at - started_at).round
  puts "Done in: #{elapsed} sec."
end

#html_index(type) ⇒ Object

Parameters:

  • type (String)


61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/relaton_ecma/data_fetcher.rb', line 61

# Fetch one listing page of the Ecma site and parse every document entry
# found on it.
#
# An error raised while parsing a single entry is logged through Util.error
# and does not prevent the remaining entries from being processed.
#
# @param type [String] path segment appended to URL to form the page address
def html_index(type) # rubocop:disable Metrics/MethodLength
  page = @agent.get "#{URL}#{type}/"
  hits = page.xpath(
    "//li/span[1]/a",
    "//div[contains(@class, 'entry-content-wrapper')][.//a[.='Download']]",
  )
  hits.each do |hit|
    parse_page(hit)
  rescue StandardError => e
    # Interpolating the backtrace array directly would log its #inspect form
    # on a single line; join it so every frame gets its own line.
    Util.error { "#{e.message}\n#{Array(e.backtrace).join("\n")}" }
  end
end

#index_id(bib) ⇒ Object



39
40
41
42
43
44
45
# File 'lib/relaton_ecma/data_fetcher.rb', line 39

# Build the index key for a document.
#
# The key always carries :id (the id of the first document identifier). :ed
# is added when the document has an edition, and :vol when one of its
# extents has the type "volume".
#
# @param bib [#docidentifier, #edition, #extent] bibliographic item
# @return [Hash] index key with :id and optionally :ed and :vol
def index_id(bib)
  key = { id: bib.docidentifier[0].id }
  edition = bib.edition
  key[:ed] = edition.content if edition
  volume = bib.extent.find { |item| item.type == "volume" }
  key[:vol] = volume.reference_from if volume
  key
end

#parse_page(hit) ⇒ Object

Parameters:

  • hit (Nokogiri::XML::Element)


56
57
58
# File 'lib/relaton_ecma/data_fetcher.rb', line 56

# Parse one listing entry and write a file for every document it yields.
#
# @param hit [Nokogiri::XML::Element] listing entry handed to DataParser
# @return [Object] the collection returned by DataParser#parse
def parse_page(hit)
  items = DataParser.new(hit).parse
  items.each { |bib| write_file(bib) }
end

#render_doc(bib) ⇒ Object



47
48
49
50
51
52
53
# File 'lib/relaton_ecma/data_fetcher.rb', line 47

# Serialize a document in the configured output format.
#
# @param bib [#to_hash, #to_xml, #to_bibxml] bibliographic item
# @return [String, nil] the serialized document, or nil when @format is
#   none of "yaml", "xml" or "bibxml"
def render_doc(bib)
  if @format == "yaml"
    bib.to_hash.to_yaml
  elsif @format == "xml"
    bib.to_xml(bibdata: true)
  elsif @format == "bibxml"
    bib.to_bibxml
  end
end

#write_file(bib) ⇒ Object

Parameters:

  • bib (RelatonBib::BibliographicItem)


24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/relaton_ecma/data_fetcher.rb', line 24

# Write one document to the output directory and register it in the index.
#
# The file name is the first document identifier with "/" and whitespace
# replaced by "_", followed by "-<edition>" (dots replaced by "_") when an
# edition is present and "-<volume>" when an extent of type "volume" exists.
# A path already written by this fetcher is reported as a duplicate and left
# untouched.
#
# @param bib [#docidentifier, #edition, #extent] bibliographic item
def write_file(bib) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  name_parts = [bib.docidentifier[0].id.gsub(%r{[/\s]}, "_")]
  name_parts << bib.edition.content.gsub(".", "_") if bib.edition
  volume = bib.extent.find { |item| item.type == "volume" }
  name_parts << volume.reference_from.to_s if volume
  file = "#{@output}/#{name_parts.join('-')}.#{@ext}"
  return Util.warn("Duplicate file #{file}") if @files.include?(file)

  @files << file
  File.write(file, render_doc(bib), encoding: "UTF-8")
  @index.add_or_update(index_id(bib), file)
end