Class: Relaton::Plateau::Fetcher
- Inherits: Object
  - Object
    - Relaton::Plateau::Fetcher
- Defined in:
- lib/relaton/plateau/fetcher.rb
Overview
Fetcher class to fetch data from the Plateau website
Constant Summary collapse
- HANDBOOKS_URL =
"https://www.mlit.go.jp/plateau/_next/data/1.3.0/libraries/handbooks.json".freeze
- TECHNICAL_REPORTS_URL =
"https://www.mlit.go.jp/plateau/_next/data/1.3.0/libraries/technical-reports.json".freeze
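Class Method Summary collapse
-
.fetch(source, output: "data", format: "yaml") ⇒ Object
Run the extraction for the given source and report timing.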
Instance Method Summary collapse
-
#create_request(uri) ⇒ Object
Create a GET request with custom headers to mimic a browser.
-
#extract_handbooks_data ⇒ Object
Extract data for handbooks.
-
#extract_technical_reports_data ⇒ Object
Extract data for technical reports.
-
#fetch_json_data(url) ⇒ Hash
Fetch JSON data from a URL with custom headers.
- #file_name(id) ⇒ Object
-
#hadle_response(response) ⇒ Object
Handle different content encodings.
- #index ⇒ Object
-
#initialize(output, format) ⇒ Fetcher
constructor
A new instance of Fetcher.
- #save_document(item) ⇒ Object
- #serialize(item) ⇒ Object
Constructor Details
#initialize(output, format) ⇒ Fetcher
Returns a new instance of Fetcher.
13 14 15 16 17 18 |
# File 'lib/relaton/plateau/fetcher.rb', line 13
#
# Build a fetcher that writes serialized documents into +output+.
#
# @param output [String] directory the document files are written to
# @param format [String] serialization format, e.g. "yaml" or "xml"
def initialize(output, format)
  @output = output
  @format = format
  # The file extension is the format name without a leading "bib".
  # \A anchors at the very start of the string; ^ would also match
  # right after any embedded newline.
  @ext = format.sub(/\Abib/, "")
  # Paths written so far; #save_document consults it to skip duplicates.
  @files = []
end
Class Method Details
.fetch(source, output: "data", format: "yaml") ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/relaton/plateau/fetcher.rb', line 24
#
# Run the extraction for +source+ and print start/stop timestamps and the
# elapsed time to standard output.
#
# @param source [String] "plateau-handbooks" or "plateau-technical-reports";
#   any other value only prints an "Invalid source" message
# @param output [String] directory to write documents to (created if missing)
# @param format [String] serialization format passed on to the fetcher
def self.fetch(source, output: "data", format: "yaml")
  puts "Started at: #{Time.now}"
  # Measure the duration on the monotonic clock so that wall-clock
  # adjustments during the run cannot skew or negate the result.
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  FileUtils.mkdir_p output

  case source
  when "plateau-handbooks"
    new(output, format).extract_handbooks_data
  when "plateau-technical-reports"
    new(output, format).extract_technical_reports_data
  else
    puts "Invalid source: #{source}"
  end

  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
  puts "Stopped at: #{Time.now}"
  puts "Done in: #{elapsed.round} sec."
end
Instance Method Details
#create_request(uri) ⇒ Object
Create a GET request with custom headers to mimic a browser
43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/relaton/plateau/fetcher.rb', line 43
#
# Create a GET request with custom headers to mimic a browser.
#
# @param uri [URI] target of the request
# @return [Net::HTTP::Get] the request with the browser-like headers set
def create_request(uri)
  headers = {
    "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0",
    "Accept" => "*/*",
    "Accept-Language" => "en-US,en;q=0.5",
    # Advertise only the encodings #hadle_response can decode. Offering
    # "br" or "zstd" would let the server answer with a body that is then
    # handed to the JSON parser still compressed. Setting this header
    # explicitly also disables Net::HTTP's own transparent decompression.
    "Accept-Encoding" => "gzip, deflate",
    "Referer" => "https://www.mlit.go.jp/plateau/libraries/",
    "purpose" => "prefetch",
    "x-nextjs-data" => "1",
    "Connection" => "keep-alive",
  }

  request = Net::HTTP::Get.new(uri)
  headers.each { |name, value| request[name] = value }
  request
end
#extract_handbooks_data ⇒ Object
Extract data for handbooks
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
# File 'lib/relaton/plateau/fetcher.rb', line 100
#
# Extract data for handbooks: fetch the handbooks JSON, parse every version
# of every handbook entry, save each parsed document and finally save the
# index.
#
# When the fetched data contains no handbook nodes (for example because
# #fetch_json_data returned an empty Hash after a failure) a warning is
# logged and nothing is written.
def extract_handbooks_data
  data = fetch_json_data(HANDBOOKS_URL)
  Util.info "Extracting handbooks data..."

  # dig tolerates missing intermediate keys instead of raising on nil.
  nodes = data.dig("pageProps", "handbooks", "nodes") || []
  if nodes.empty?
    Util.warn "No handbooks data found, nothing to extract."
    return
  end

  nodes.each do |entry|
    handbook = entry["handbook"]
    # The description holds the English title and the abstract separated by
    # "<br />". A missing description yields two empty strings; a missing
    # second part leaves the abstract nil.
    parts = handbook["description"]&.split("<br />") || ["", ""]
    title_en, abstract = parts.first(2).map(&:strip)
    # Entries whose slug contains a hyphen are treated as annexes.
    doctype = entry["slug"].include?("-") ? "annex" : "handbook"

    handbook["versions"].each do |version|
      item = HandbookParser.new(
        version: version,
        entry: entry,
        title_en: title_en,
        abstract: abstract,
        doctype: doctype
      ).parse
      save_document(item)
    end
  end
  index.save
end
#extract_technical_reports_data ⇒ Object
Extract data for technical reports
126 127 128 129 130 131 132 133 |
# File 'lib/relaton/plateau/fetcher.rb', line 126
#
# Extract data for technical reports: fetch the technical reports JSON,
# parse and save every entry, then save the index.
#
# When the fetched data contains no nodes (for example because
# #fetch_json_data returned an empty Hash after a failure) a warning is
# logged and nothing is written.
def extract_technical_reports_data
  data = fetch_json_data(TECHNICAL_REPORTS_URL)
  Util.info "Extracting technical reports data..."

  # dig tolerates a missing "pageProps" key instead of raising on nil.
  nodes = data.dig("pageProps", "nodes") || []
  if nodes.empty?
    Util.warn "No technical reports data found, nothing to extract."
    return
  end

  # Iterated for its side effects only, so each rather than map.
  nodes.each { |entry| save_document(TechnicalReportParser.new(entry).parse) }
  index.save
end
#fetch_json_data(url) ⇒ Hash
Fetch JSON data from a URL with custom headers
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/relaton/plateau/fetcher.rb', line 71
#
# Fetch JSON data from a URL with custom headers.
#
# Any StandardError raised while building the request, talking to the server,
# decoding the body or parsing the JSON is logged and swallowed.
#
# @param url [String] URL of the JSON document
# @return [Hash] the parsed JSON, or an empty Hash when the response status
#   is not 200 or an error occurred
def fetch_json_data(url)
  uri = URI(url)
  request = create_request(uri)

  # Enable TLS according to the URL scheme rather than unconditionally.
  response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == "https") do |http|
    http.request(request)
  end

  unless response.code.to_i == 200
    Util.warn "Failed to fetch data: #{response.code} #{response.message}"
    return {}
  end

  # The body may be compressed; decode it before parsing.
  JSON.parse(hadle_response(response))
rescue StandardError => e
  Util.error "Error fetching JSON data from #{url}: #{e.message}"
  {}
end
#file_name(id) ⇒ Object
147 148 149 150 151 152 153 154 155 |
# File 'lib/relaton/plateau/fetcher.rb', line 147
#
# Build the output path for a document identifier.
#
# Whitespace runs become underscores, every remaining non-word character is
# dropped and the result is lower-cased. The two edition markers below would
# be removed by that clean-up, so they are carried over as "_private" and
# "_public" suffixes.
#
# @param id [String] document identifier
# @return [String] path inside the output directory with the format extension
def file_name(id)
  slug = id.gsub(/\s+/, "_").gsub(/\W+/, "").downcase
  suffix =
    case id
    when /民間活用編/ then "_private"
    when /公共活用編/ then "_public"
    else ""
    end
  File.join(@output, "#{slug}#{suffix}.#{@ext}")
end
#hadle_response(response) ⇒ Object
Handle different content encodings
57 58 59 60 61 62 63 64 65 |
# File 'lib/relaton/plateau/fetcher.rb', line 57
#
# Handle different content encodings.
#
# The Content-Encoding header is compared case-insensitively and with
# surrounding whitespace ignored. "deflate" bodies are first decoded as a
# zlib stream and, if that fails, as a raw deflate stream.
#
# @param response [#body, #[]] HTTP response
# @return [String] the decoded body; the body unchanged when the encoding is
#   absent or not gzip/deflate
def hadle_response(response)
  body = response.body

  case response["Content-Encoding"].to_s.strip.downcase
  when "gzip", "x-gzip"
    # wrap closes the reader once the block has returned.
    Zlib::GzipReader.wrap(StringIO.new(body), &:read)
  when "deflate"
    begin
      Zlib::Inflate.inflate(body)
    rescue Zlib::DataError
      # Some servers send raw deflate data without the zlib header;
      # negative window bits select the raw format.
      raw = Zlib::Inflate.new(-Zlib::MAX_WBITS)
      begin
        raw.inflate(body)
      ensure
        raw.close
      end
    end
  else
    body
  end
end
#index ⇒ Object
20 21 22 |
# File 'lib/relaton/plateau/fetcher.rb', line 20
#
# Memoized :plateau index backed by "index-v1.yaml"; it is looked up (or
# created) on first use and the same object is returned afterwards.
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:plateau, file: "index-v1.yaml")
end
#save_document(item) ⇒ Object
135 136 137 138 139 140 141 142 143 144 145 |
# File 'lib/relaton/plateau/fetcher.rb', line 135
#
# Write one document to disk and register it in the index.
#
# The target path is derived from the id of the item's first document
# identifier. A path already written during this run is not written again;
# a warning is logged instead.
#
# @param item [#docidentifier] bibliographic item to save
def save_document(item)
  doc_id = item.docidentifier.first.id
  path = file_name(doc_id)
  return Util.warn("File #{path} already exists, skipping.", key: doc_id) if @files.include?(path)

  File.write(path, serialize(item))
  @files << path
  index.add_or_update(doc_id, path)
end
#serialize(item) ⇒ Object
157 158 159 160 161 162 163 |
# File 'lib/relaton/plateau/fetcher.rb', line 157
#
# Serialize an item in the format given to the constructor.
#
# "yaml" renders the item's hash as YAML and "xml" calls to_xml with
# bibdata: true. Any other format is dispatched to the item's public
# to_<format> method.
#
# @param item [Object] bibliographic item to serialize
# @return [Object] whatever the selected serializer returns
# @raise [NoMethodError] if the item has no public to_<format> method
def serialize(item)
  case @format
  when "yaml" then item.to_hash.to_yaml
  when "xml" then item.to_xml bibdata: true
  else
    # public_send respects method visibility, so a format name cannot be
    # used to reach the item's private methods.
    item.public_send("to_#{@format}")
  end
end