Class: RelatonOgc::DataFetcher

Inherits:
Object
  • Object
show all
Includes:
Utils
Defined in:
lib/relaton_ogc/data_fetcher.rb

Defined Under Namespace

Modules: Utils

Constant Summary

Constants included from Utils

Utils::ENDPOINT

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Utils

#etag, #etag=, #get_data

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Create DataFetcher instance

Parameters:

  • output (String)

    directory to save the documents

  • format (String)

    output format “yaml”, “xml”, or “bibxml”



48
49
50
51
52
53
54
55
# File 'lib/relaton_ogc/data_fetcher.rb', line 48

# Create DataFetcher instance
#
# @param output [String] directory to save the documents
# @param format [String] output format "yaml", "xml", or "bibxml"
def initialize(output, format)
  @output = output
  @format = format
  # Removing the first "bib" turns "bibxml" into the "xml" file extension;
  # formats that do not contain "bib" are used as the extension unchanged.
  @ext = format.sub("bib", "")
  @etagfile = File.join(output, "etag.txt")
  # Identifiers already written, and identifiers encountered more than once.
  @docids = []
  @dupids = Set.new
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ Object



61
62
63
64
65
66
67
68
69
# File 'lib/relaton_ogc/data_fetcher.rb', line 61

# Create the output directory, run a full fetch with a new instance and
# print the start time, the stop time and the elapsed seconds.
#
# @param output [String] directory to save the documents; created when missing
# @param format [String] output format handed to the new instance
# @return [nil] the result of the final +puts+
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  # The duration is measured on the monotonic clock so that wall-clock
  # adjustments during a long run cannot skew or negate it.
  tick = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  puts "Started at: #{started_at}"
  FileUtils.mkdir_p output
  new(output, format).fetch
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - tick
  puts "Stopped at: #{Time.now}"
  puts "Done in: #{elapsed.round} sec."
end

Instance Method Details

#content(bib) ⇒ Object



110
111
112
113
114
115
116
# File 'lib/relaton_ogc/data_fetcher.rb', line 110

# Serialize a bibliographic item according to the configured output format.
#
# @param bib [#to_xml, #to_hash, #to_bibxml] item to serialize
# @return [String, nil] serialized document; nil when @format is none of
#   "xml", "yaml" or "bibxml"
def content(bib)
  if @format == "xml"
    bib.to_xml(bibdata: true)
  elsif @format == "yaml"
    bib.to_hash.to_yaml
  elsif @format == "bibxml"
    bib.to_bibxml
  end
end

#fetch ⇒ Object



71
72
73
74
75
76
77
78
79
# File 'lib/relaton_ogc/data_fetcher.rb', line 71

# Process every hit yielded by #get_data, warn about duplicated
# identifiers, store the etag when nothing failed, and save the index.
#
# The etag is only stored when #fetch_doc returned a truthy value for every
# hit; the index is saved regardless.
def fetch
  get_data do |etag, json|
    failed = false
    # Every hit is processed even after a failure; only the flag changes.
    json.each do |_, hit|
      failed = true unless fetch_doc(hit)
    end
    Util.warn "Duplicated documents: #{@dupids.to_a.join(', ')}" if @dupids.any?
    self.etag = etag unless failed
    index.save
  end
end

#fetch_doc(hit) ⇒ Object



81
82
83
84
85
86
87
88
89
90
91
# File 'lib/relaton_ogc/data_fetcher.rb', line 81

# Parse a single hit and write the resulting document.
#
# Hits whose "type" is "CC" are skipped. A skipped hit reports success:
# callers that treat a falsy result as a failure (as #fetch does when
# deciding whether to store the etag) must not count a skip as an error.
#
# @param hit [Hash] hit data; the "type" and "identifier" keys are read here
# @return [Boolean] true when the hit was written or skipped, false when a
#   StandardError was rescued and logged
def fetch_doc(hit)
  return true if hit["type"] == "CC"

  bib = Scrapper.parse_page hit
  write_document bib
  true
rescue StandardError => e
  Util.error "Fetching document: #{hit['identifier']}\n" \
             "#{e.class} #{e.message}\n#{e.backtrace}"
  false
end

#file_name(bib) ⇒ Object



105
106
107
108
# File 'lib/relaton_ogc/data_fetcher.rb', line 105

# Build the path of the file a document is stored in.
#
# The first document identifier is upcased and every whitespace character,
# colon and dot in it is replaced with an underscore.
#
# @param bib [#docidentifier] bibliographic item
# @return [String] "<output>/<name>.<ext>"
def file_name(bib)
  docid = bib.docidentifier[0].id
  basename = docid.upcase.gsub(/[\s:.]/, "_")
  "#{@output}/#{basename}.#{@ext}"
end

#index ⇒ Object



57
58
59
# File 'lib/relaton_ogc/data_fetcher.rb', line 57

# Memoized index obtained from Relaton::Index.find_or_create for the :ogc
# type and the "index-v1.yaml" file.
#
# @return [Object] whatever Relaton::Index.find_or_create returned
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:ogc, file: "index-v1.yaml")
end

#write_document(bib) ⇒ Object

rubocop:disable Metrics/AbcSize



93
94
95
96
97
98
99
100
101
102
103
# File 'lib/relaton_ogc/data_fetcher.rb', line 93

# Write a document to its file and register it in the index, unless a
# document with the same first identifier was already written.
#
# A repeated identifier is recorded in @dupids and nothing is written.
#
# @param bib [#docidentifier] bibliographic item
# @return [Object, nil] nil for a duplicate, otherwise the File.write result
def write_document(bib) # rubocop:disable Metrics/AbcSize
  docid = bib.docidentifier[0].id
  if @docids.include?(docid)
    @dupids << docid
    nil
  else
    @docids << docid
    path = file_name(bib)
    index.add_or_update(docid, path)
    File.write(path, content(bib), encoding: "UTF-8")
  end
end