Class: RelatonIec::DataFetcher
- Inherits:
-
Object
- Object
- RelatonIec::DataFetcher
- Defined in:
- lib/relaton_iec/data_fetcher.rb
Constant Summary collapse
- ENTRYPOINT =
"https://api.iec.ch/harmonized/publications?size=100&sortBy=urn&page=".freeze
- CREDENTIAL =
"https://api.iec.ch/oauth/client_credential/accesstoken?grant_type=client_credentials".freeze
- LAST_CHANGE_FILE =
"last_change.txt".freeze
Instance Method Summary collapse
-
#access_token ⇒ String
Get access token.
- #create_index ⇒ Object
-
#fetch ⇒ Object
Fetch data from IEC.
-
#fetch_all ⇒ void
Fetch documents from IEC API.
-
#fetch_page(page) ⇒ Net::HTTP::Response
Fetch page from IEC API.
-
#fetch_page_token(page) ⇒ Net::HTTP::Response
Fetch page.
-
#fetch_pub(pub) ⇒ Object
Fetch publication and save it to file.
- #index_id(pub) ⇒ Object
-
#initialize(source = "iec-harmonised-latest", output: "data", format: "yaml") ⇒ DataFetcher
constructor
Initialize new instance.
- #last_change_max(date) ⇒ Object
- #save_last_change ⇒ Object
Constructor Details
#initialize(source = "iec-harmonised-latest", output: "data", format: "yaml") ⇒ DataFetcher
Initialize new instance.
14 15 16 17 18 19 20 21 22 23 |
# File 'lib/relaton_iec/data_fetcher.rb', line 14
#
# Initialize new instance.
#
# @param source [String] data set name; the value "iec-harmonised-all"
#   flags the instance for a full fetch (stored in @all)
# @param output [String] value stored in @output
# @param format [String] output format; its name without a leading "bib"
#   becomes the file extension (e.g. "bibxml" -> "xml")
#
def initialize(source = "iec-harmonised-latest", output: "data", format: "yaml")
  @output = output
  @format = format
  @ext = format.sub(/^bib/, "")
  @files = []
  # @index = Index.new "index.yaml"
  # The timestamp of the previous run is only available when the marker file exists.
  if File.exist?(LAST_CHANGE_FILE)
    @last_change = File.read(LAST_CHANGE_FILE, encoding: "UTF-8")
  end
  # Start the high-water mark from the previous value ("" when there is none).
  @last_change_max = @last_change.to_s
  @all = (source == "iec-harmonised-all")
end
Instance Method Details
#access_token ⇒ String
Get access token.
145 146 147 148 149 150 151 152 153 154 155 |
# File 'lib/relaton_iec/data_fetcher.rb', line 145
#
# Get access token.
#
# The token is requested from CREDENTIAL with HTTP basic auth built from the
# IEC_HAPI_PROJ_PUBS_KEY and IEC_HAPI_PROJ_PUBS_SECRET environment variables
# and cached in @access_token, so later calls reuse it.
#
# @return [String] value of the "access_token" field of the JSON response
#
# @raise [KeyError] if one of the environment variables is not set
#
def access_token # rubocop:disable Metrics/AbcSize
  return @access_token if @access_token

  credential_uri = URI(CREDENTIAL)
  request = Net::HTTP::Get.new(credential_uri)
  request.basic_auth(
    ENV.fetch("IEC_HAPI_PROJ_PUBS_KEY"),
    ENV.fetch("IEC_HAPI_PROJ_PUBS_SECRET"),
  )
  response = Net::HTTP.start(credential_uri.hostname, credential_uri.port, use_ssl: true) do |http|
    http.request(request)
  end
  @access_token = JSON.parse(response.body)["access_token"]
end
#create_index ⇒ Object
59 60 61 62 63 64 65 66 67 68 |
# File 'lib/relaton_iec/data_fetcher.rb', line 59
#
# Rebuild the IEC index ("index1.yaml") from the YAML files found directly
# in the output directory and in "static".
#
# Every existing entry is removed first; each file is then registered under
# the "id" of its docid entry that has a truthy "primary" value.
#
def create_index
  iec_index = Relaton::Index.find_or_create(:IEC, file: "index1.yaml")
  iec_index.remove_all
  Dir["{#{@output},static}/*.yaml"].each do |path|
    document = YAML.load_file(path)
    primary_docid = document["docid"].detect { |docid| docid["primary"] }
    iec_index.add_or_update(primary_docid["id"], path)
  end
  iec_index.save
end
#fetch ⇒ Object
Fetch data from IEC.
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/relaton_iec/data_fetcher.rb', line 38
#
# Fetch data from IEC.
#
# Prepares the output directory (wiping it first when a full fetch was
# requested), downloads the documents, rebuilds the index and stores the
# newest change timestamp. Start/stop times and the duration are printed
# to stdout.
#
# Any StandardError raised on the way is not propagated: its message and
# backtrace are reported through Util.error and the remaining steps are
# skipped.
#
def fetch # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  t1 = Time.now
  puts "Started at: #{t1}"

  # A full fetch starts from an empty output directory.
  FileUtils.rm_rf @output if @all
  FileUtils.mkdir_p @output
  fetch_all
  create_index
  save_last_change

  t2 = Time.now
  puts "Stopped at: #{t2}"
  puts "Done in: #{(t2 - t1).round} sec."
rescue StandardError => e
  Util.error do
    "#{e.message}\n#{e.backtrace.join("\n")}"
  end
end
#fetch_all ⇒ void
This method returns an undefined value.
Fetch documents from IEC API.
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# File 'lib/relaton_iec/data_fetcher.rb', line 88
#
# Fetch documents from IEC API.
#
# Pages are requested starting from page 0. Every entry of a page's
# "publication" list is handed to #fetch_pub. Paging continues while the
# response's "link" header contains rel="last"; a response whose code is
# not "200" is reported through Util.warn and stops the loop.
#
# @return [void]
#
def fetch_all # rubocop:disable Metrics/MethodLength
  page = 0
  loop do
    res = fetch_page_token page
    unless res.code == "200"
      Util.warn res.body.to_s
      break
    end

    # Treat a response without a "publication" list as an empty page instead
    # of failing with NoMethodError on nil.
    publications = JSON.parse(res.body)["publication"] || []
    publications.each { |pub| fetch_pub pub }
    page += 1
    break unless res["link"]&.include?('rel="last"')
  end
end
#fetch_page(page) ⇒ Net::HTTP::Response
Fetch page from IEC API.
127 128 129 130 131 132 133 134 135 136 137 138 |
# File 'lib/relaton_iec/data_fetcher.rb', line 127
#
# Fetch page from IEC API.
#
# The request goes to ENTRYPOINT followed by the page number and carries the
# access token as a Bearer authorization header. Unless a full fetch was
# requested, a known last-change timestamp is appended as the
# "lastChangeTimestampFrom" query parameter.
#
# @param page [Integer] page number
#
# @return [Net::HTTP::Response] response
#
def fetch_page(page)
  incremental = !@all && @last_change
  change_filter = incremental ? "&lastChangeTimestampFrom=#{@last_change}" : ""
  endpoint = URI("#{ENTRYPOINT}#{page}#{change_filter}")

  request = Net::HTTP::Get.new(endpoint)
  request["Authorization"] = "Bearer #{access_token}"
  Net::HTTP.start(endpoint.hostname, endpoint.port, use_ssl: true) { |http| http.request(request) }
end
#fetch_page_token(page) ⇒ Net::HTTP::Response
Fetch page. If response code is 401, then get new access token and try again.
111 112 113 114 115 116 117 118 |
# File 'lib/relaton_iec/data_fetcher.rb', line 111
#
# Fetch page. If the response code is "401", the cached access token is
# dropped and the page is requested once more.
#
# @param page [Integer] page number
#
# @return [Net::HTTP::Response] the first response, or the response of the
#   retry when the first one was a 401
#
def fetch_page_token(page)
  response = fetch_page(page)
  return response unless response.code == "401"

  # Clearing the cached token makes the retry obtain a new one.
  @access_token = nil
  fetch_page(page)
end
#fetch_pub(pub) ⇒ Object
Fetch publication and save it to file.
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
# File 'lib/relaton_iec/data_fetcher.rb', line 162
#
# Fetch publication and save it to file.
#
# The file name is the primary document identifier, lower-cased, with every
# colon, whitespace character and slash replaced by "_", plus the configured
# extension. A file name already produced during this run is reported through
# Util.warn, and the file is still (re)written.
#
# @param pub [Hash] publication data as received from the API
#
def fetch_pub(pub) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  bib = DataParser.new(pub).parse
  primary_id = bib.docidentifier.detect(&:primary)
  basename = primary_id.id.downcase.gsub(/[:\s\/]/, '_')
  file = File.join(@output, "#{basename}.#{@ext}")

  if @files.include?(file)
    Util.warn "File #{file} exists."
  else
    @files << file
    # @index.add index_id(pub), file, pub["lastChangeTimestamp"]
  end

  # Keep track of the newest change timestamp seen so far.
  last_change_max(pub["lastChangeTimestamp"])

  content =
    case @format
    when "xml" then bib.to_xml(bibdata: true)
    when "yaml", "yml" then bib.to_hash.to_yaml
    when "bibxml" then bib.to_bibxml
    end
  File.write(file, content, encoding: "UTF-8")
end
#index_id(pub) ⇒ Object
180 181 182 183 184 185 186 187 188 189 |
# File 'lib/relaton_iec/data_fetcher.rb', line 180
#
# Build the index ID(s) of a publication.
#
# The part number is the first "-<digits>" group of the reference. The first
# title is searched for "- Part <number>:" mentions whose number starts with
# the same digit as that part; every hit yields the reference with its part
# replaced by the mentioned number.
#
# @param pub [Hash] publication data ("reference" and "title" keys are read)
#
# @return [String, Array<String>, nil] the list of derived references when
#   more than one was found, otherwise the unchanged reference
#
def index_id(pub)
  reference = pub["reference"]
  part_match = /-(?<part>\d+)/.match(reference)
  title = pub.dig("title", 0, "value")
  return reference unless part_match && title

  part = part_match[:part]
  part_numbers = title.scan(/(?<=-\sPart\s)#{part[0]}\d+(?=:)/)
  ids = part_numbers.map { |number| reference.sub(/-#{part}/, "-#{number}") }
  ids.size > 1 ? ids : reference
end
#last_change_max(date) ⇒ Object
25 26 27 |
# File 'lib/relaton_iec/data_fetcher.rb', line 25
#
# Remember the greatest change timestamp seen so far.
#
# Timestamps are compared as strings. A nil date (for example a publication
# without a "lastChangeTimestamp" value) is ignored instead of raising
# ArgumentError from the comparison.
#
# @param date [String, nil] change timestamp of a publication
#
# @return [String, nil] the new maximum when it was updated, otherwise nil
#
def last_change_max(date)
  return if date.nil?

  @last_change_max = date if @last_change_max < date
end
#save_last_change ⇒ Object
29 30 31 32 33 |
# File 'lib/relaton_iec/data_fetcher.rb', line 29
#
# Write the greatest change timestamp seen so far to LAST_CHANGE_FILE.
# Nothing is written when that timestamp is an empty string.
#
def save_last_change
  unless @last_change_max.empty?
    File.write(LAST_CHANGE_FILE, @last_change_max, encoding: "UTF-8")
  end
end