Class: RelatonOasis::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_oasis/data_fetcher.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Initialize a new DataFetcher

Parameters:

  • output (String)

    directory to save files, default: “data”

  • format (String)

    format of output files (xml, yaml, bibxml); default: yaml



9
10
11
12
13
14
15
16
# File 'lib/relaton_oasis/data_fetcher.rb', line 9

# Set up a fetcher that writes documents into +output+ using +format+.
#
# Besides storing both arguments, this derives the file extension from the
# format name and prepares the two indexes that #fetch saves at the end.
#
# @param output [String] directory to save files
# @param format [String] format of output files (xml, yaml, bibxml)
def initialize(output, format)
  @files = []
  @output = output
  @format = format
  # The extension is the format name without a leading "bib" or "rfc",
  # e.g. "bibxml" is written with the "xml" extension.
  @ext = format.sub(/^bib|^rfc/, "")
  @index = Index.new
  @index1 = Relaton::Index.find_or_create(:oasis, file: "index-v1.yaml")
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ Object

Initialize fetcher and run fetch

Parameters:

  • output (String) (defaults to: "data")

    directory to save files, default: “data”

  • format (String) (defaults to: "yaml")

    format of output files (xml, yaml, bibxml); default: yaml



24
25
26
27
28
29
30
31
32
# File 'lib/relaton_oasis/data_fetcher.rb', line 24

# Build a fetcher, run it, and report timing on standard output.
#
# The +output+ directory (including missing parents) is created before the
# fetch starts. Three lines are printed: the start time, the stop time and
# the elapsed time rounded to whole seconds.
#
# @param output [String] directory to save files, default: "data"
# @param format [String] format of output files (xml, yaml, bibxml);
#   default: "yaml"
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  puts "Started at: #{started_at}"
  FileUtils.mkdir_p(output)
  fetcher = new(output, format)
  fetcher.fetch
  finished_at = Time.now
  puts "Stopped at: #{finished_at}"
  elapsed = (finished_at - started_at).round
  puts "Done in: #{elapsed} sec."
end

Instance Method Details

#fetch ⇒ Object

Fetch and save all the documents from OASIS



37
38
39
40
41
42
43
44
45
46
47
# File 'lib/relaton_oasis/data_fetcher.rb', line 37

# Fetch and save all the documents from OASIS.
#
# Downloads the standards page, then for every +details+ element found in it
# saves the parsed document followed by its parts. Both indexes are saved
# once all elements have been processed.
def fetch
  page = Mechanize.new.get("https://www.oasis-open.org/standards/")
  html = Nokogiri::HTML(page.body)
  html.xpath("//details").each do |entry|
    save_doc(DataParser.new(entry).parse)
    fetch_parts(entry)
  end
  @index.save
  @index1.save
end

#fetch_parts(item) ⇒ Object

Fetch and save parts of document

Parameters:

  • item (Nokogiri::HTML::Element)

    document node



54
55
56
57
58
59
60
61
# File 'lib/relaton_oasis/data_fetcher.rb', line 54

# Fetch and save parts of a document.
#
# Looks up the "cite as" paragraphs of +item+ and saves each one as its own
# document. Nothing is saved when fewer than two such paragraphs are found.
#
# @param item [Nokogiri::HTML::Element] document node
def fetch_parts(item)
  cite_as_xpath = "./div/div/div[contains(@class, 'standard__grid--cite-as')]/p[strong or span/strong]"
  cite_nodes = item.xpath(cite_as_xpath)
  return if cite_nodes.size <= 1

  cite_nodes.each { |node| save_doc(DataPartParser.new(node).parse) }
end

#file_name(doc) ⇒ String

Generate file name

Parameters:

  • doc

    document whose docnumber is used to build the file name

Returns:

  • (String)

    file name



92
93
94
95
# File 'lib/relaton_oasis/data_fetcher.rb', line 92

# Generate the output file path for a document.
#
# Whitespace, commas, colons and slashes in the document number become
# underscores, runs of underscores are collapsed into one, and the result is
# upper-cased. The path is that name plus the configured extension, placed
# inside the output directory.
#
# @param doc document responding to +docnumber+
# @return [String] file name
def file_name(doc)
  sanitized = doc.docnumber.gsub(/[\s,:\/]/, "_")
  basename = sanitized.squeeze("_").upcase
  File.join(@output, "#{basename}.#{@ext}")
end

#save_doc(doc) ⇒ Object

Save document to file



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/relaton_oasis/data_fetcher.rb', line 68

# Save document to file.
#
# The document is serialised according to the configured format and written
# to the path given by #file_name. If that path was already produced earlier
# by this fetcher a warning is emitted (the file is still overwritten);
# otherwise the path is remembered and registered in the first index. The
# second index is added to or updated on every call.
#
# @param doc document responding to +docnumber+ and to the serialisation
#   method for the configured format
def save_doc(doc) # rubocop:disable Metrics/MethodLength
  content =
    if @format == "xml"
      doc.to_xml(bibdata: true)
    elsif @format == "yaml"
      doc.to_hash.to_yaml
    else
      # Any other format maps to a method named after it, e.g. to_bibxml.
      doc.send("to_#{@format}")
    end
  path = file_name(doc)
  if @files.include?(path)
    Util.warn "File #{path} already exists. Document: #{doc.docnumber}"
  else
    @files << path
    @index[doc] = path
  end
  @index1.add_or_update(doc.docnumber, path)
  File.write(path, content, encoding: "UTF-8")
end