Class: RelatonNist::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_nist/data_fetcher.rb

Constant Summary collapse

URL =
"https://github.com/usnistgov/NIST-Tech-Pubs/releases/download/Sept2024/allrecords-MODS.xml"

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Returns a new instance of DataFetcher.



11
12
13
14
15
16
# File 'lib/relaton_nist/data_fetcher.rb', line 11

#
# Set up a fetcher.
#
# @param output [String] value stored in @output (used elsewhere as the
#   directory the documents are written to)
# @param format [String] output format; a leading "bib" is stripped to
#   obtain the file extension (e.g. "bibxml" -> "xml")
#
def initialize(output, format)
  @files = []
  @ext = format.sub(/^bib/, "")
  @format = format
  @output = output
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ Object

Fetch all the documents from the dataset

Parameters:

  • output (String) (defaults to: "data")

    folder name to save the documents

  • format (String) (defaults to: "yaml")

    format to save the documents (yaml, xml, bibxml)



94
95
96
# File 'lib/relaton_nist/data_fetcher.rb', line 94

#
# Build a fetcher with the given options and run it.
#
# @param output [String] folder name to save the documents
# @param format [String] format to save the documents (yaml, xml, bibxml)
#
# @return [Object] whatever the instance-level #fetch returns
#
def self.fetch(output: "data", format: "yaml")
  fetcher = new(output, format)
  fetcher.fetch
end

Instance Method Details

#add_static_files ⇒ Object



80
81
82
83
84
85
86
# File 'lib/relaton_nist/data_fetcher.rb', line 80

#
# Register every YAML file matching ./static/*.yaml in the index.
#
# Each file is loaded, converted to a bibliographic item, and added to the
# index under the item's first document identifier.
#
# @return [Array<String>] the paths that were processed
#
def add_static_files
  Dir.glob("./static/*.yaml").each do |path|
    item = RelatonNist::NistBibliographicItem.from_hash(YAML.load_file(path))
    docid = item.docidentifier[0].id
    index.add_or_update(docid, path)
  end
end

#fetch ⇒ Object

Fetch all the documents from the dataset



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/relaton_nist/data_fetcher.rb', line 54

#
# Fetch all the documents from the dataset.
#
# Creates the output directory if needed, removes the files with the current
# extension from it, writes the tech pubs, registers the static files and
# saves the index. Start/stop timestamps and the elapsed time are printed to
# stdout. Errors are not rescued here.
#
# @return [nil]
#
def fetch # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  # Wall-clock time is used for display only; the duration is taken from the
  # monotonic clock so a system clock adjustment cannot skew it or make it
  # negative.
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  puts "Started at: #{Time.now}"

  FileUtils.mkdir_p @output
  FileUtils.rm Dir[File.join(@output, "*.#{@ext}")]

  fetch_tech_pubs
  add_static_files
  index.save

  puts "Stopped at: #{Time.now}"
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started
  puts "Done in: #{elapsed.round} sec."
end

#fetch_tech_pubs ⇒ Object



72
73
74
75
76
77
78
# File 'lib/relaton_nist/data_fetcher.rb', line 72

#
# Download the MODS dataset at URL and write every record to disk.
#
# The dataset is opened with the block form of OpenURI.open_uri so the
# underlying IO is closed as soon as the collection has been parsed, instead
# of being left open. Each record is then parsed with ModsParser and passed
# to #write_file.
#
# NOTE(review): assumes LocMods::Collection.from_xml reads the IO eagerly
# and does not keep it for later use - confirm.
#
# @return [Object] the record list returned by the collection's #mods
#
def fetch_tech_pubs
  docs = OpenURI.open_uri(URL) { |io| LocMods::Collection.from_xml io }
  docs.mods.each { |doc| write_file ModsParser.new(doc, series).parse }
end

#index ⇒ Object



18
19
20
# File 'lib/relaton_nist/data_fetcher.rb', line 18

#
# Index that document identifiers and their files are registered in.
#
# The index is obtained once through Relaton::Index.find_or_create and then
# cached in @index for subsequent calls.
#
# @return [Object] the object returned by Relaton::Index.find_or_create
#
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:nist, file: "index-v1.yaml")
end

#output(bib) ⇒ Object



43
44
45
46
47
48
49
# File 'lib/relaton_nist/data_fetcher.rb', line 43

#
# Serialize a bibliographic item according to @format.
#
# "yaml" dumps the item's hash as YAML, "xml" renders XML with
# bibdata: true, and any other format is dispatched to the item's
# "to_<format>" method.
#
# @param bib [Object] item responding to #to_hash, #to_xml and the
#   relevant "to_<format>" method
#
# @return [Object] the serialized document
#
def output(bib)
  return bib.to_hash.to_yaml if @format == "yaml"
  return bib.to_xml(bibdata: true) if @format == "xml"

  bib.send("to_#{@format}")
end

#series ⇒ Object



22
23
24
# File 'lib/relaton_nist/data_fetcher.rb', line 22

#
# Contents of the series.yaml file located next to this source file.
#
# The file is loaded on first use and cached in @series afterwards.
#
# @return [Object] the data parsed from series.yaml
#
def series
  return @series if @series

  path = File.expand_path("series.yaml", __dir__)
  @series = YAML.load_file(path)
end

#write_file(bib) ⇒ Object

Save document



31
32
33
34
35
36
37
38
39
40
41
# File 'lib/relaton_nist/data_fetcher.rb', line 31

#
# Save document
#
# The file name is built from the first document identifier: slashes,
# whitespace, colons and dots become underscores, the result is upcased and
# a leading "NIST_IR" is collapsed to "NISTIR". If the same path was already
# written during this run a warning is logged; the file is written (and the
# index updated) in either case.
#
# @param bib [Object] bibliographic item responding to #docidentifier
#
# @return [Integer] number of bytes written
#
def write_file(bib) # rubocop:disable Metrics/AbcSize
  docid = bib.docidentifier[0].id
  basename = docid.gsub(%r{[/\s:.]}, "_").upcase.sub(/^NIST_IR/, "NISTIR")
  path = File.join(@output, "#{basename}.#{@ext}")

  if @files.include?(path)
    Util.warn "File #{path} exists. Docid: #{docid}"
  else
    @files << path
  end

  index.add_or_update(docid, path)
  File.write(path, output(bib), encoding: "UTF-8")
end