Class: RelatonIec::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_iec/data_fetcher.rb

Constant Summary collapse

ENTRYPOINT =
"https://api.iec.ch/harmonized/publications?size=100&sortBy=urn&page=".freeze
CREDENTIAL =
"https://api.iec.ch/oauth/client_credential/accesstoken?grant_type=client_credentials".freeze
LAST_CHANGE_FILE =
"last_change.txt".freeze

Instance Method Summary collapse

Constructor Details

#initialize(source = "iec-harmonised-latest", output: "data", format: "yaml") ⇒ DataFetcher

Initialize new instance.

Parameters:

  • source (String) (defaults to: "iec-harmonised-latest")

    source name (iec-harmonised-all, iec-harmonised-latest)

  • output (String) (defaults to: "data")

    output directory

  • format (String) (defaults to: "yaml")

    format of output files (xml, bibxml, yaml)



14
15
16
17
18
19
20
21
22
23
# File 'lib/relaton_iec/data_fetcher.rb', line 14

# Initialize new instance.
#
# @param source [String] source name; @all is set only when this equals
#   "iec-harmonised-all"
# @param output [String] output directory
# @param format [String] format of output files (xml, bibxml, yaml)
def initialize(source = "iec-harmonised-latest", output: "data", format: "yaml")
  @all = source == "iec-harmonised-all"
  @output = output
  @format = format
  # A leading "bib" is dropped, so "bibxml" files get the "xml" extension.
  @ext = format.sub(/^bib/, "")
  @files = []
  if File.exist?(LAST_CHANGE_FILE)
    @last_change = File.read(LAST_CHANGE_FILE, encoding: "UTF-8")
  end
  # With no stored timestamp the running maximum starts as an empty string.
  @last_change_max = @last_change.to_s
end

Instance Method Details

#access_token ⇒ String

Get access token.

Returns:

  • (String)

    access token



145
146
147
148
149
150
151
152
153
154
155
# File 'lib/relaton_iec/data_fetcher.rb', line 145

# Get access token.
#
# The token is requested from CREDENTIAL using HTTP basic auth built from the
# IEC_HAPI_PROJ_PUBS_KEY and IEC_HAPI_PROJ_PUBS_SECRET environment variables.
# A truthy result is kept in @access_token and reused on later calls.
#
# @return [String] access token
# @raise [KeyError] if one of the environment variables is not set
def access_token # rubocop:disable Metrics/AbcSize
  return @access_token if @access_token

  endpoint = URI(CREDENTIAL)
  request = Net::HTTP::Get.new(endpoint)
  key = ENV.fetch("IEC_HAPI_PROJ_PUBS_KEY")
  secret = ENV.fetch("IEC_HAPI_PROJ_PUBS_SECRET")
  request.basic_auth(key, secret)
  response = Net::HTTP.start(endpoint.hostname, endpoint.port, use_ssl: true) do |http|
    http.request(request)
  end
  @access_token = JSON.parse(response.body)["access_token"]
end

#create_index ⇒ Object



59
60
61
62
63
64
65
66
67
68
# File 'lib/relaton_iec/data_fetcher.rb', line 59

# Rebuild the IEC index ("index1.yaml") from the YAML files found directly in
# the output directory and in "static".
#
# All existing index entries are removed first; each file is then registered
# under the "id" of its "docid" entry that is flagged "primary".
def create_index
  index = Relaton::Index.find_or_create :IEC, file: "index1.yaml"
  index.remove_all
  Dir["{#{@output},static}/*.yaml"].each do |path|
    docids = YAML.load_file(path)["docid"]
    primary = docids.detect { |docid| docid["primary"] }
    index.add_or_update primary["id"], path
  end
  index.save
end

#fetch ⇒ Object

Fetch data from IEC.



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/relaton_iec/data_fetcher.rb', line 38

# Fetch data from IEC.
#
# Removes the output directory first when @all is set, then fetches the
# publications, rebuilds the index and stores the last change timestamp.
# Start time, stop time and duration are printed to stdout. Any StandardError
# is reported through Util.error instead of being raised to the caller.
def fetch # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  started = Time.now
  puts "Started at: #{started}"

  FileUtils.rm_rf @output if @all
  FileUtils.mkdir_p @output
  fetch_all
  create_index
  save_last_change

  stopped = Time.now
  puts "Stopped at: #{stopped}"
  puts "Done in: #{(stopped - started).round} sec."
rescue StandardError => e
  Util.error { "#{e.message}\n#{e.backtrace.join("\n")}" }
end

#fetch_all ⇒ void

This method returns an undefined value.

Fetch documents from IEC API.



88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/relaton_iec/data_fetcher.rb', line 88

# Fetch documents from IEC API.
#
# Pages are requested in order starting at 0. Every entry of a page's
# "publication" list is handed to fetch_pub. Paging continues while the
# response's "link" value contains rel="last"; a response whose code is not
# "200" is logged with Util.warn and ends the run.
#
# @return [void]
def fetch_all # rubocop:disable Metrics/MethodLength
  (0..Float::INFINITY).each do |page|
    response = fetch_page_token page
    if response.code != "200"
      Util.warn response.body.to_s
      break
    end
    JSON.parse(response.body)["publication"].each { |pub| fetch_pub pub }
    break unless response["link"]&.include?("rel=\"last\"")
  end
end

#fetch_page(page) ⇒ Net::HTTP::Response

Fetch page from IEC API.

Parameters:

  • page (Integer)

    page number

Returns:

  • (Net::HTTP::Response)

    response



127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/relaton_iec/data_fetcher.rb', line 127

# Fetch page from IEC API.
#
# The request URL is ENTRYPOINT followed by the page number. Unless @all is
# set, a stored @last_change value is appended as the
# lastChangeTimestampFrom query parameter. The request carries the current
# access token as a Bearer authorization header.
#
# @param page [Integer] page number
# @return [Net::HTTP::Response] response
def fetch_page(page)
  incremental = !@all && @last_change
  filter = incremental ? "&lastChangeTimestampFrom=#{@last_change}" : ""
  uri = URI("#{ENTRYPOINT}#{page}#{filter}")
  request = Net::HTTP::Get.new(uri)
  request["Authorization"] = "Bearer #{access_token}"
  Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) { |http| http.request(request) }
end

#fetch_page_token(page) ⇒ Net::HTTP::Response

Fetch page. If response code is 401, then get new access token and try again.

Parameters:

  • page (Integer)

    page number

Returns:

  • (Net::HTTP::Response)

    response



111
112
113
114
115
116
117
118
# File 'lib/relaton_iec/data_fetcher.rb', line 111

# Fetch page. If the response code is "401", the cached access token is
# discarded and the page is requested one more time.
#
# @param page [Integer] page number
# @return [Net::HTTP::Response] the first response, or the retried one after
#   a "401"
def fetch_page_token(page)
  response = fetch_page(page)
  return response unless response.code == "401"

  # Clearing the cached token makes the retry run without the rejected one.
  @access_token = nil
  fetch_page(page)
end

#fetch_pub(pub) ⇒ Object

Fetch publication and save it to file.

Parameters:

  • pub (Hash)

    publication



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/relaton_iec/data_fetcher.rb', line 162

# Fetch publication and save it to file.
#
# The file name is derived from the primary document identifier: lower-cased,
# with colons, whitespace and slashes replaced by underscores, plus the
# configured extension. A warning is logged when the same file name was
# already produced during this run; the file is written regardless.
#
# @param pub [Hash] publication
def fetch_pub(pub) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  bib = DataParser.new(pub).parse
  primary = bib.docidentifier.detect(&:primary)
  basename = primary.id.downcase.gsub(/[:\s\/]/, '_')
  path = File.join(@output, "#{basename}.#{@ext}")
  if @files.include?(path)
    Util.warn "File #{path} exists."
  else
    @files << path
  end
  last_change_max pub["lastChangeTimestamp"]
  # Any other format leaves the content nil.
  content =
    if @format == "xml" then bib.to_xml(bibdata: true)
    elsif %w[yaml yml].include?(@format) then bib.to_hash.to_yaml
    elsif @format == "bibxml" then bib.to_bibxml
    end
  File.write path, content, encoding: "UTF-8"
end

#index_id(pub) ⇒ Object



180
181
182
183
184
185
186
187
188
189
# File 'lib/relaton_iec/data_fetcher.rb', line 180

# Derive index id(s) for a publication.
#
# When the first title value lists more than one "- Part NNN:" number that
# starts with the same digit as the part number in the reference, one
# reference per listed number is returned. In every other case the
# publication's own reference is returned unchanged.
#
# @param pub [Hash] publication; reads "reference" and the "value" of the
#   first "title" entry
# @return [String, Array<String>, nil] reference or list of references
def index_id(pub)
  reference = pub["reference"]
  matched = /-(?<part>\d+)/.match(reference)
  title = pub.dig("title", 0, "value")
  return reference unless matched && title

  part = matched[:part]
  # Numbers that follow "- Part " and precede ":", sharing the first digit.
  listed = /(?<=-\sPart\s)#{part[0]}\d+(?=:)/
  ids = title.scan(listed).map { |number| reference.sub(/-#{part}/, "-#{number}") }
  return reference if ids.size <= 1

  ids
end

#last_change_max(date) ⇒ Object



25
26
27
# File 'lib/relaton_iec/data_fetcher.rb', line 25

# Raise the running maximum change timestamp when a later one is seen.
#
# Timestamps are compared as strings.
#
# @param date [String, nil] timestamp to compare; nil is ignored
# @return [String, nil] the new maximum when it was raised, nil otherwise
def last_change_max(date)
  # Comparing a String with nil raises ArgumentError, so a missing timestamp
  # is skipped here instead of raising.
  return if date.nil?

  @last_change_max = date if @last_change_max < date
end

#save_last_change ⇒ Object



29
30
31
32
33
# File 'lib/relaton_iec/data_fetcher.rb', line 29

# Write the running maximum change timestamp to LAST_CHANGE_FILE.
#
# Nothing is written while the maximum is still an empty string.
#
# @return [Integer, nil] number of bytes written, or nil when skipped
def save_last_change
  unless @last_change_max.empty?
    File.write LAST_CHANGE_FILE, @last_change_max, encoding: "UTF-8"
  end
end