Class: RelatonEcma::DataFetcher
- Inherits:
  Object
  - Object
    - RelatonEcma::DataFetcher
- Defined in:
- lib/relaton_ecma/data_fetcher.rb
Constant Summary
- URL =
"https://www.ecma-international.org/publications-and-standards/"
Instance Method Summary
-
#fetch ⇒ void
Fetch data from Ecma website.
- #html_index(type) ⇒ Object
- #index_id(bib) ⇒ Object
-
#initialize(output: "data", format: "yaml") ⇒ DataFetcher
constructor
A new instance of DataFetcher.
- #parse_page(hit) ⇒ Object
- #render_doc(bib) ⇒ Object
- #write_file(bib) ⇒ Object
Constructor Details
#initialize(output: "data", format: "yaml") ⇒ DataFetcher
Returns a new instance of DataFetcher.
13 14 15 16 17 18 19 20 21 |
# File 'lib/relaton_ecma/data_fetcher.rb', line 13
#
# Build a fetcher that writes documents into +output+ using +format+.
#
# The file extension is the format name with a leading "bib" removed, so
# "bibxml" files get the ".xml" extension.
#
# @param output [String] directory the rendered documents are written to
# @param format [String] output format; #render_doc handles "yaml", "xml"
#   and "bibxml"
def initialize(output: "data", format: "yaml")
  @output = output
  @format = format
  # \A anchors at the start of the string rather than at the start of any line.
  @ext = format.sub(/\Abib/, "")
  @files = []
  @index = Relaton::Index.find_or_create :ECMA
  @agent = Mechanize.new
  # Pick a random known alias. Sampling the actual key list avoids assuming the
  # alias table has a fixed number of entries (an out-of-range index yields nil).
  @agent.user_agent_alias = Mechanize::AGENT_ALIASES.keys.sample
end
Instance Method Details
#fetch ⇒ void
This method returns an undefined value.
Fetch data from Ecma website.
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
# File 'lib/relaton_ecma/data_fetcher.rb', line 80
#
# Fetch data from Ecma website.
#
# Creates the output directory, processes the "standards",
# "technical-reports" and "mementos" index pages in that order, saves the
# index, and prints start/stop timestamps plus the elapsed whole seconds.
#
# @return [void]
def fetch
  started_at = Time.now
  puts "Started at: #{started_at}"
  FileUtils.mkdir_p @output
  %w[standards technical-reports mementos].each { |type| html_index type }
  @index.save
  stopped_at = Time.now
  puts "Stopped at: #{stopped_at}"
  puts "Done in: #{(stopped_at - started_at).round} sec."
end
#html_index(type) ⇒ Object
61 62 63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/relaton_ecma/data_fetcher.rb', line 61
#
# Fetch the index page for one publication type and parse every hit on it.
#
# An error raised while parsing a single hit is reported through Util.error
# (message followed by the backtrace) and the remaining hits are still
# processed. Errors raised by the page request itself are not rescued here.
#
# @param type [String] path segment appended to URL; #fetch passes
#   "standards", "technical-reports" and "mementos"
def html_index(type) # rubocop:disable Metrics/MethodLength
  result = @agent.get "#{URL}#{type}/"
  # @last_call_time = Time.now
  result.xpath(
    "//li/span[1]/a",
    "//div[contains(@class, 'entry-content-wrapper')][.//a[.='Download']]",
  ).each do |hit|
    # workers << hit
    parse_page(hit)
  rescue StandardError => e
    Util.error { "#{e.message}\n#{e.backtrace}" }
  end
end
#index_id(bib) ⇒ Object
39 40 41 42 43 44 45 |
# File 'lib/relaton_ecma/data_fetcher.rb', line 39
#
# Build the index key for a bibliographic item.
#
# The key always carries :id (id of the first document identifier). :ed is
# added when the item has an edition, and :vol when one of its extents has
# the type "volume".
#
# @param bib [Object] item responding to docidentifier, edition and extent
# @return [Hash] key with :id and, when available, :ed and :vol
def index_id(bib)
  key = { id: bib.docidentifier[0].id }
  key[:ed] = bib.edition.content if bib.edition
  volume = bib.extent.find { |locality| locality.type == "volume" }
  key[:vol] = volume.reference_from if volume
  key
end
#parse_page(hit) ⇒ Object
56 57 58 |
# File 'lib/relaton_ecma/data_fetcher.rb', line 56
#
# Parse one index hit with DataParser and write every item it yields.
#
# @param hit [Object] node taken from the index page (see #html_index)
# @return [Object] the collection returned by DataParser#parse
def parse_page(hit)
  items = DataParser.new(hit).parse
  items.each do |bib|
    write_file bib
  end
end
#render_doc(bib) ⇒ Object
47 48 49 50 51 52 53 |
# File 'lib/relaton_ecma/data_fetcher.rb', line 47
#
# Serialize a bibliographic item according to the configured format.
#
# @param bib [Object] item responding to to_hash, to_xml and to_bibxml
# @return [Object, nil] the serializer's result; nil when @format is none of
#   "yaml", "xml" or "bibxml"
def render_doc(bib)
  if @format == "yaml"
    bib.to_hash.to_yaml
  elsif @format == "xml"
    bib.to_xml(bibdata: true)
  elsif @format == "bibxml"
    bib.to_bibxml
  end
end
#write_file(bib) ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/relaton_ecma/data_fetcher.rb', line 24
#
# Render one bibliographic item to a file in the output directory and record
# it in the index.
#
# The file name starts with the id of the first document identifier, with
# slashes and whitespace replaced by "_". When present, the edition (dots
# replaced by "_") and the reference_from of the "volume" extent are appended,
# each separated by "-". A path already written by this instance is reported
# through Util.warn and skipped.
#
# @param bib [Object] item responding to docidentifier, edition and extent
def write_file(bib) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  name_parts = [bib.docidentifier[0].id.gsub(%r{[/\s]}, "_")]
  name_parts << bib.edition.content.gsub(".", "_") if bib.edition
  volume = bib.extent.find { |locality| locality.type == "volume" }
  name_parts << volume.reference_from.to_s if volume
  path = "#{@output}/#{name_parts.join('-')}.#{@ext}"
  return Util.warn("Duplicate file #{path}") if @files.include?(path)

  @files << path
  File.write(path, render_doc(bib), encoding: "UTF-8")
  @index.add_or_update(index_id(bib), path)
end