Class: RelatonIetf::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_ietf/data_fetcher.rb

Constant Summary collapse

INDEX1 =
"index-v1".freeze

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source, output, format) ⇒ DataFetcher

Data fetcher initializer

Parameters:

  • source (String)

    source name

  • output (String)

    directory to save files

  • format (String)

    format of output files (xml, yaml, bibxml); for ietf-rfcsubseries source only: xml



18
19
20
21
22
23
24
25
# File 'lib/relaton_ietf/data_fetcher.rb', line 18

#
# Set up fetcher state.
#
# @param [String] source source name
# @param [String] output directory to save files
# @param [String] format format of output files
#
def initialize(source, output, format)
  @source, @output, @format = source, output, format
  # File extension is the format with a leading "bib" or "rfc" removed
  # (e.g. "bibxml" gives "xml").
  @ext = format.sub(/^bib|^rfc/, "")
  @files = []
  index_file = "#{INDEX1}.yaml"
  @index = Relaton::Index.find_or_create(:IETF, file: index_file)
end

Class Method Details

.fetch(source, output: "data", format: "yaml") ⇒ Object

Initialize fetcher and run fetch

Parameters:

  • source (String)

    source name

  • output (String) (defaults to: "data")

    directory to save files, default: “data”

  • format (String) (defaults to: "yaml")

    format of output files (xml, yaml, bibxml); default: yaml; for ietf-rfcsubseries source only: xml



35
36
37
38
39
40
41
42
43
# File 'lib/relaton_ietf/data_fetcher.rb', line 35

#
# Build a fetcher, run it, and print start time, stop time and the elapsed
# whole seconds to standard output.
#
# @param [String] source source name
# @param [String] output directory to save files (created if missing)
# @param [String] format format of output files
#
def self.fetch(source, output: "data", format: "yaml")
  started = Time.now
  puts "Started at: #{started}"
  FileUtils.mkdir_p(output)
  fetcher = new(source, output, format)
  fetcher.fetch
  finished = Time.now
  puts "Stopped at: #{finished}"
  elapsed = (finished - started).round
  puts "Done in: #{elapsed} sec."
end

Instance Method Details

#add_to_index(entry, file) ⇒ Object



210
211
212
213
214
# File 'lib/relaton_ietf/data_fetcher.rb', line 210

#
# Register a saved document in the index.
#
# The index key is the id of the entry's primary document identifier, or of
# its first identifier when none is flagged as primary.
#
# @param [#docidentifier] entry document that was saved
# @param [String] file path the document was written to
#
def add_to_index(entry, file)
  key_id = entry.docidentifier.find(&:primary) || entry.docidentifier.first
  @index.add_or_update(key_id.id, file)
end

#create_series(ref, versions) ⇒ Object

Create unversioned bibliographic item

Parameters:

  • ref (String)

    reference

  • versions (Array<String>)

    list of versions



120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/relaton_ietf/data_fetcher.rb', line 120

#
# Create and save the unversioned (series) bibliographic item.
#
# The series item lists every version as an "includes" relation and takes
# its title and abstract from the highest-numbered version already saved in
# the output directory.
#
# @param [String] ref series reference (draft name without version suffix)
# @param [Array<String>] versions list of versioned references
#
def create_series(ref, versions) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  # Order by the trailing number so that the last element is the latest version.
  vs = versions.sort_by { |v| v.match(/\d+$/).to_s.to_i }
  fref = RelatonBib::FormattedRef.new content: ref
  docid = RelatonBib::DocumentIdentifier.new type: "Internet-Draft", id: ref, primary: true
  rel = vs.map { |v| version_relation v, "includes" }
  last_file = "#{@output}/#{vs.last}.#{@ext}"
  title, abstract =
    if @format == "yaml"
      last_v = HashConverter.hash_to_bib YAML.load_file(last_file)
      [last_v[:title], last_v[:abstract]]
    else
      # The saved file is not YAML here (e.g. format "xml"), so it must be read
      # with the format-aware reader rather than the YAML loader.
      # NOTE(review): assumes the parsed item exposes #title and #abstract in a
      # shape IetfBibliographicItem.new accepts - confirm against RelatonBib.
      last_v = read_doc(last_file)
      [last_v.title, last_v.abstract]
    end
  bib = IetfBibliographicItem.new(
    title: title, abstract: abstract, formattedref: fref,
    docid: [docid], relation: rel
  )
  save_doc bib
end

#fetch ⇒ Object

Fetch documents



48
49
50
51
52
53
54
55
# File 'lib/relaton_ietf/data_fetcher.rb', line 48

#
# Run the fetch routine that matches the configured source, then save the
# index. An unrecognised source runs no routine; the index is still saved.
#
def fetch
  routines = {
    "ietf-rfcsubseries" => :fetch_ieft_rfcsubseries,
    "ietf-internet-drafts" => :fetch_ieft_internet_drafts,
    "ietf-rfc-entries" => :fetch_ieft_rfcs,
  }
  routine = routines[@source]
  send(routine) if routine
  @index.save
end

#fetch_ieft_internet_drafts ⇒ Object

Fetches ietf-internet-drafts documents



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/relaton_ietf/data_fetcher.rb', line 69

#
# Parse and save every BibXML file found under "bibxml-ids/" (relative to the
# current working directory).
#
# Files whose name contains "D.draft-" are recorded as draft versions; when
# such a name ends in digits, those digits are stored as the item's version.
# After all files are saved, version relations are updated unless the output
# format is "bibxml".
#
def fetch_ieft_internet_drafts # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  versions = []
  Dir["bibxml-ids/*.xml"].each do |path|
    name = File.basename path, ".xml"
    draft_ver = nil
    if name.include?("D.draft-")
      versions << name.sub(/^reference\.I-D\./, "").downcase
      draft_ver = name[/(?<ver>\d+)$/, "ver"]
    end
    bib = BibXMLParser.parse(File.read(path, encoding: "UTF-8"))
    if draft_ver
      # The parsed item is patched directly through its instance variable.
      bib.instance_variable_set(
        :@version, [RelatonBib::BibliographicItem::Version.new(nil, draft_ver)]
      )
    end
    save_doc bib
  end
  update_versions(versions) if versions.any? && @format != "bibxml"
end

#fetch_ieft_rfcs ⇒ Object

Fetches ietf-rfc-entries documents



167
168
169
170
171
172
173
174
# File 'lib/relaton_ietf/data_fetcher.rb', line 167

#
# Parse and save every "rfc-entry" element of the RFC index.
#
# A failure on one entry is reported through Util.error (message plus the
# first six backtrace lines) and does not stop the remaining entries.
#
def fetch_ieft_rfcs
  rfc_index.xpath("xmlns:rfc-entry").each do |doc|
    save_doc RfcEntry.parse(doc)
  rescue StandardError => e
    # The entry may have no doc-id element; the error report must not raise
    # in that case, or it would abort the run and hide the original error.
    doc_id = doc.at("./xmlns:doc-id")&.text || "(unknown doc-id)"
    trace = Array(e.backtrace).first(6).join("\n")
    Util.error "Error parsing #{doc_id}: #{e.message}\n#{trace}"
  end
end

#fetch_ieft_rfcsubseries ⇒ Object

Fetches ietf-rfcsubseries documents



60
61
62
63
64
# File 'lib/relaton_ietf/data_fetcher.rb', line 60

#
# Parse and save every "bcp-entry", "fyi-entry" and "std-entry" element of
# the RFC index.
#
def fetch_ieft_rfcsubseries
  entries = rfc_index.xpath("xmlns:bcp-entry|xmlns:fyi-entry|xmlns:std-entry")
  entries.each { |node| save_doc(RfcIndexEntry.parse(node)) }
end

#file_name(entry) ⇒ String

Generate file name

Parameters:

Returns:

  • (String)

    file name



223
224
225
226
227
228
229
230
231
232
233
# File 'lib/relaton_ietf/data_fetcher.rb', line 223

#
# Generate the output file path for a document.
#
# The base name is the id of the entry's "Internet-Draft" identifier when it
# has one, otherwise its docnumber, otherwise its formatted reference. It is
# lower-cased for the "ietf-internet-drafts" source and upper-cased for any
# other source; whitespace, commas, colons and slashes become underscores and
# runs of underscores are collapsed.
#
# The entry's own strings are left untouched: case conversion works on a copy.
#
# @param [Object] entry document to name
#
# @return [String] file name
#
def file_name(entry) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  id = if entry.respond_to? :docidentifier
         entry.docidentifier.detect { |i| i.type == "Internet-Draft" }&.id
       end
  id ||= entry.docnumber || entry.formattedref.content
  # Non-bang forms: the bang variants would rewrite the entry's identifier in
  # place and raise FrozenError on frozen strings.
  cased = @source == "ietf-internet-drafts" ? id.downcase : id.upcase
  name = cased.gsub(/[\s,:\/]/, "_").squeeze("_")
  File.join @output, "#{name}.#{@ext}"
end

#read_doc(file) ⇒ RelatonIetf::IetfBibliographicItem

Read saved documents

Parameters:

  • file (String)

    path to file

Returns:



155
156
157
158
159
160
161
162
# File 'lib/relaton_ietf/data_fetcher.rb', line 155

#
# Read a saved document and parse it according to the configured format:
# "xml" through XMLParser, "yaml" through IetfBibliographicItem.from_hash,
# anything else through BibXMLParser.
#
# @param [String] file path to file
#
# @return [RelatonIetf::IetfBibliographicItem] parsed document
#
def read_doc(file)
  content = File.read(file, encoding: "UTF-8")
  return XMLParser.from_xml(content) if @format == "xml"
  return IetfBibliographicItem.from_hash(YAML.safe_load(content)) if @format == "yaml"

  BibXMLParser.parse(content)
end

#rfc_index ⇒ Nokogiri::XML::Document

Get RFC index

Returns:

  • (Nokogiri::XML::Document)

    RFC index



181
182
183
184
# File 'lib/relaton_ietf/data_fetcher.rb', line 181

#
# Download the RFC index from rfc-editor.org and return its "rfc-index"
# root element. The index is downloaded again on every call.
#
# @return [Nokogiri::XML::Document] RFC index
#
def rfc_index
  body = Net::HTTP.get(URI("https://www.rfc-editor.org/rfc-index.xml"))
  Nokogiri::XML(body).at("/xmlns:rfc-index")
end

#save_doc(entry, check_duplicate: true) ⇒ Object

Save document to file

Parameters:



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# File 'lib/relaton_ietf/data_fetcher.rb', line 192

#
# Serialize a document in the configured format, write it to its file and
# add it to the index. Does nothing when entry is nil.
#
# With check_duplicate enabled, a file name already produced during this run
# triggers a warning; the file is overwritten either way.
#
# @param [Object, nil] entry document to save
# @param [Boolean] check_duplicate warn about repeated file names
#
def save_doc(entry, check_duplicate: true) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity
  return unless entry

  content = if @format == "xml" then entry.to_xml(bibdata: true)
            elsif @format == "yaml" then entry.to_hash.to_yaml
            else entry.send("to_#{@format}")
            end
  path = file_name entry
  if check_duplicate
    if @files.include?(path)
      Util.warn "File #{path} already exists. Document: #{entry.docnumber}"
    else
      @files << path
    end
  end
  File.write path, content, encoding: "UTF-8"
  add_to_index entry, path
end

#update_versions(versions) ⇒ Object

Updates I-D’s versions

Parameters:

  • versions (Array<String>)

    list of versions



91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/relaton_ietf/data_fetcher.rb', line 91

#
# Link saved Internet-Draft versions to each other.
#
# For every saved file named "<series>-NN.<ext>" this creates the series
# document once per series, then adds an "updates" relation to the closest
# lower version and an "updatedBy" relation to the closest higher version,
# and saves the document again without the duplicate-name check.
#
# @param [Array<String>] versions list of versioned references
#
def update_versions(versions) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  version_number = ->(ref) { ref.match(/\d+$/).to_s.to_i }
  series = ""
  bib_versions = []
  # Sort explicitly: series are detected by a change between consecutive
  # files, and Dir[] only returns sorted results since Ruby 3.0.
  Dir["#{@output}/*.#{@ext}"].sort.each do |file|
    # Match the base name only, so a "draft-" in the directory path cannot
    # leak into the series name.
    match = /(?<series>draft-.+)-(?<ver>\d{2})\.#{@ext}$/.match File.basename(file)
    next unless match

    if series != match[:series]
      bib_versions = versions.grep(/^#{Regexp.quote match[:series]}-\d{2}/)
      create_series match[:series], bib_versions
      series = match[:series]
    end
    current = match[:ver].to_i
    # Pick neighbours by version number, not by position, because the order
    # of the versions list is not guaranteed.
    previous_ref = bib_versions.select { |ref| version_number.call(ref) < current }.max_by(&version_number)
    next_ref = bib_versions.select { |ref| version_number.call(ref) > current }.min_by(&version_number)
    next unless previous_ref || next_ref

    bib = read_doc(file)
    bib.relation << version_relation(previous_ref, "updates") if previous_ref
    bib.relation << version_relation(next_ref, "updatedBy") if next_ref
    save_doc bib, check_duplicate: false
  end
end

#version_relation(ref, type) ⇒ RelatonBib::DocumentRelation

Create bibitem relation

Parameters:

  • ref (String)

    reference

  • type (String)

    relation type

Returns:

  • (RelatonBib::DocumentRelation)

    relation



141
142
143
144
145
146
# File 'lib/relaton_ietf/data_fetcher.rb', line 141

#
# Build a relation of the given type to a minimal bibliographic item that
# carries only a formatted reference and a primary "Internet-Draft"
# identifier, both set to ref.
#
# @param [String] ref reference
# @param [String] type relation type
#
# @return [RelatonBib::DocumentRelation] relation
#
def version_relation(ref, type)
  target = IetfBibliographicItem.new(
    formattedref: RelatonBib::FormattedRef.new(content: ref),
    docid: [RelatonBib::DocumentIdentifier.new(type: "Internet-Draft", id: ref, primary: true)],
  )
  RelatonBib::DocumentRelation.new(type: type, bibitem: target)
end