Class: Relaton::Plateau::Fetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton/plateau/fetcher.rb

Overview

Fetcher class to fetch data from the Plateau website

Constant Summary collapse

HANDBOOKS_URL =
"https://www.mlit.go.jp/plateau/_next/data/1.3.0/libraries/handbooks.json".freeze
TECHNICAL_REPORTS_URL =
"https://www.mlit.go.jp/plateau/_next/data/1.3.0/libraries/technical-reports.json".freeze

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ Fetcher

Returns a new instance of Fetcher.



13
14
15
16
17
18
# File 'lib/relaton/plateau/fetcher.rb', line 13

# Build a fetcher bound to an output location and a serialization format.
#
# @param output [String] stored as-is in @output
# @param format [String] format name, e.g. "yaml" or "xml"
def initialize(output, format)
  @files = []
  @output, @format = output, format
  # The extension is the format name with a leading "bib" removed.
  @ext = format.sub(/^bib/, "")
end

Class Method Details

.fetch(source, output: "data", format: "yaml") ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/relaton/plateau/fetcher.rb', line 24

# Run a fetch for one data source, printing start/stop timestamps and the
# elapsed time (rounded to whole seconds) to stdout.
#
# @param source [String] "plateau-handbooks" or "plateau-technical-reports";
#   any other value only prints an "Invalid source" message
# @param output [String] directory to write into; created if missing
# @param format [String] passed through to the new fetcher instance
def self.fetch(source, output: "data", format: "yaml")
  started = Time.now
  puts "Started at: #{started}"
  FileUtils.mkdir_p output

  case source
  when "plateau-handbooks" then new(output, format).extract_handbooks_data
  when "plateau-technical-reports" then new(output, format).extract_technical_reports_data
  else puts "Invalid source: #{source}"
  end

  finished = Time.now
  puts "Stopped at: #{finished}"
  puts "Done in: #{(finished - started).round} sec."
end

Instance Method Details

#create_request(uri) ⇒ Object

Create a GET request with custom headers to mimic a browser



43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/relaton/plateau/fetcher.rb', line 43

# Create a GET request with custom headers to mimic a browser.
#
# Because Accept-Encoding is set explicitly here, the response body has to be
# decoded by this class (hadle_response handles gzip and deflate). Only those
# two encodings are advertised: "br" and "zstd" cannot be decoded with Ruby's
# standard library, and a body in one of them would reach JSON.parse still
# compressed.
#
# @param uri [URI] target of the request
# @return [Net::HTTP::Get] the prepared request
def create_request(uri)
  headers = {
    "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0",
    "Accept" => "*/*",
    "Accept-Language" => "en-US,en;q=0.5",
    "Accept-Encoding" => "gzip, deflate",
    "Referer" => "https://www.mlit.go.jp/plateau/libraries/",
    "purpose" => "prefetch",
    "x-nextjs-data" => "1",
    "Connection" => "keep-alive",
  }

  request = Net::HTTP::Get.new(uri)
  headers.each { |name, value| request[name] = value }
  request
end

#extract_handbooks_data ⇒ Object

Extract data for handbooks



100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/relaton/plateau/fetcher.rb', line 100

# Extract data for handbooks.
#
# Fetches the handbooks listing, parses every version of every handbook with
# HandbookParser, saves each parsed document and finally saves the index.
#
# @return [Object, nil] the result of saving the index, or nil when the
#   listing contains no handbook nodes (e.g. the fetch failed)
def extract_handbooks_data
  data = fetch_json_data(HANDBOOKS_URL)
  # fetch_json_data returns an empty Hash on failure; stop here rather than
  # raising NoMethodError while walking the missing keys.
  nodes = data.dig("pageProps", "handbooks", "nodes")
  unless nodes
    Util.warn "No handbooks data found, nothing to extract."
    return
  end

  Util.info "Extracting handbooks data..."
  nodes.each do |entry|
    handbook = entry["handbook"]

    # The description holds "<title><br /><abstract>"; a missing description
    # yields empty strings, a missing part yields nil.
    parts = handbook["description"]&.split("<br />") || ["", ""]
    title_en, abstract = parts.first(2).map(&:strip)

    # A slug containing a hyphen is treated as an annex.
    doctype = entry["slug"].include?("-") ? "annex" : "handbook"

    handbook["versions"].each do |version|
      item = HandbookParser.new(
        version: version, entry: entry, title_en: title_en, abstract: abstract, doctype: doctype
      ).parse
      save_document(item)
    end
  end
  index.save
end

#extract_technical_reports_data ⇒ Object

Extract data for technical reports



126
127
128
129
130
131
132
133
# File 'lib/relaton/plateau/fetcher.rb', line 126

# Extract data for technical reports.
#
# Fetches the technical reports listing, parses each node with
# TechnicalReportParser, saves each parsed document and finally saves the
# index.
#
# @return [Object, nil] the result of saving the index, or nil when the
#   listing contains no nodes (e.g. the fetch failed)
def extract_technical_reports_data
  data = fetch_json_data(TECHNICAL_REPORTS_URL)
  # fetch_json_data returns an empty Hash on failure; stop here rather than
  # raising NoMethodError while walking the missing keys.
  nodes = data.dig("pageProps", "nodes")
  unless nodes
    Util.warn "No technical reports data found, nothing to extract."
    return
  end

  Util.info "Extracting technical reports data..."
  # Iterated for its side effects only, so each rather than map.
  nodes.each { |entry| save_document(TechnicalReportParser.new(entry).parse) }
  index.save
end

#fetch_json_data(url) ⇒ Hash

Fetch JSON data from a URL with custom headers

Parameters:

  • url (String)

    The URL to fetch JSON data from

Returns:

  • (Hash)

    The parsed JSON data



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/relaton/plateau/fetcher.rb', line 71

# Fetch JSON data from a URL with custom headers.
#
# Any failure (invalid URL, connection error, non-200 status, undecodable or
# unparsable body) is logged through Util and reported as an empty Hash, so
# callers must be prepared for missing keys.
#
# @param url [String] the URL to fetch JSON data from
# @return [Hash] the parsed JSON data, or {} on failure
def fetch_json_data(url)
  uri = URI(url)
  request = create_request(uri)

  # Enable TLS only when the URL scheme asks for it, instead of forcing it
  # for every URL.
  response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == "https") do |http|
    http.request(request)
  end

  unless response.code.to_i == 200
    Util.warn "Failed to fetch data: #{response.code} #{response.message}"
    return {}
  end

  # The body may be compressed; decode it before parsing.
  JSON.parse(hadle_response(response))
rescue StandardError => e
  # Deliberately best-effort: log and fall back to an empty result.
  Util.error "Error fetching JSON data from #{url}: #{e.message}"
  {}
end

#file_name(id) ⇒ Object



147
148
149
150
151
152
153
154
155
# File 'lib/relaton/plateau/fetcher.rb', line 147

# Build the output file path for a document identifier.
#
# Whitespace runs become underscores, remaining non-word characters are
# dropped and the result is lower-cased. Identifiers containing 民間活用編 or
# 公共活用編 get a "_private" / "_public" suffix respectively.
#
# @param id [String] document identifier
# @return [String] path inside @output with the @ext extension
def file_name(id)
  suffix =
    case id
    when /民間活用編/ then "_private"
    when /公共活用編/ then "_public"
    else ""
    end

  base = id.gsub(/\s+/, "_").gsub(/\W+/, "").downcase
  File.join(@output, "#{base}#{suffix}.#{@ext}")
end

#hadle_response(response) ⇒ Object

Handle different content encodings



57
58
59
60
61
62
63
64
65
# File 'lib/relaton/plateau/fetcher.rb', line 57

# Handle different content encodings.
#
# Returns the response body, decompressed when the Content-Encoding header
# says it is gzip or deflate; any other (or missing) encoding returns the
# body untouched.
#
# @param response [#[], #body] HTTP response
# @return [String] the decoded body
def hadle_response(response)
  # Content-coding names are case-insensitive, so normalise before comparing.
  case response["Content-Encoding"].to_s.strip.downcase
  when "gzip"
    # Block form closes the reader once the body has been read.
    Zlib::GzipReader.wrap(StringIO.new(response.body)) { |gz| gz.read }
  when "deflate"
    Zlib::Inflate.inflate(response.body)
  else
    response.body
  end
end

#index ⇒ Object



20
21
22
# File 'lib/relaton/plateau/fetcher.rb', line 20

# Return the :plateau index backed by "index-v1.yaml", looking it up (or
# creating it) on first use and reusing the same object afterwards.
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:plateau, file: "index-v1.yaml")
end

#save_document(item) ⇒ Object



135
136
137
138
139
140
141
142
143
144
145
# File 'lib/relaton/plateau/fetcher.rb', line 135

# Write one document to disk and register it in the index.
#
# The target path is derived from the first document identifier. A path
# already written during this run is not overwritten; a warning is logged
# instead.
#
# @param item [#docidentifier] document to save
def save_document(item)
  id = item.docidentifier.first.id
  path = file_name(id)
  return Util.warn("File #{path} already exists, skipping.", key: id) if @files.include?(path)

  File.write(path, serialize(item))
  @files << path
  index.add_or_update(id, path)
end

#serialize(item) ⇒ Object



157
158
159
160
161
162
163
# File 'lib/relaton/plateau/fetcher.rb', line 157

# Turn a document into a string according to @format.
#
# "yaml" dumps the item's hash as YAML, "xml" renders XML with
# bibdata: true, and any other format calls the item's matching
# to_<format> method.
#
# @param item [Object] document to serialize
# @return [Object] whatever the chosen conversion returns
def serialize(item)
  return item.to_hash.to_yaml if @format == "yaml"
  return item.to_xml(bibdata: true) if @format == "xml"

  item.send("to_#{@format}")
end