Class: RelatonIeee::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_ieee/data_fetcher.rb

Constant Summary collapse

# Maps one-letter IEEE relation codes to relation attributes
# (`:type` and an optional `:description`).
# Codes mapped to `false` are ignored: `add_crossref` and `create_relation`
# return early for them.
RELATION_TYPES =
  {
    "S" => { type: "obsoletedBy" },
    "V" => { type: "updates", description: "revises" },
    "T" => { type: "updates", description: "amends" },
    "C" => { type: "updates", description: "corrects" },
    "O" => { type: "adoptedFrom" },
    "P" => { type: "complementOf", description: "supplement" },
  }.merge(%w[N G F I E B W].map { |code| [code, false] }.to_h).freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Create RelatonIeee::DataFetcher instance

Parameters:

  • output (String)

    output dir

  • format (String)

    output format. Allowed values: “yaml” or “xml”



28
29
30
31
32
33
34
# File 'lib/relaton_ieee/data_fetcher.rb', line 28

#
# Create RelatonIeee::DataFetcher instance
#
# @param output [String] output dir
# @param format [String] output format; a leading "bib" is stripped
#   to obtain the extension of the written files
#
def initialize(output, format)
  # both lookup tables start empty and are filled while documents are fetched
  @backrefs = {}
  @crossrefs = {}
  @output, @format = output, format
  @ext = format.sub(/^bib/, "")
end

Instance Attribute Details

#backrefs ⇒ Hash (readonly)

Returns list of AMSID => PubID.

Returns:

  • (Hash)

    list of AMSID => PubID



20
21
22
# File 'lib/relaton_ieee/data_fetcher.rb', line 20

# Lookup table filled by `fetch_doc`: keys are AMSIDs, values are the IDs of
# the first document identifier of each saved document.
#
# @return [Hash] list of AMSID => PubID
def backrefs
  @backrefs
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ Object

Convert documents from the `ieee-rawbib` dir (IEEE dataset) to BibYAML/BibXML

Parameters:

  • output (String) (defaults to: "data")

    (‘data’) output dir

  • format (String) (defaults to: "yaml")

    (‘yaml’) output format. Allowed values: “yaml” or “xml”



43
44
45
46
47
48
49
50
51
# File 'lib/relaton_ieee/data_fetcher.rb', line 43

#
# Convert documents from the `ieee-rawbib` dir (IEEE dataset) to BibYAML/BibXML.
# Creates the output dir, runs a new fetcher instance and prints the start
# time, the stop time and the elapsed seconds.
#
# @param output [String] ('data') output dir
# @param format [String] ('yaml') output format
#
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  puts("Started at: #{started_at}")
  FileUtils.mkdir_p(output)
  new(output, format).fetch
  finished_at = Time.now
  puts("Stopped at: #{finished_at}")
  elapsed = (finished_at - started_at).round
  puts("Done in: #{elapsed} sec.")
end

Instance Method Details

#add_crossref(docnumber, amsid) ⇒ Object

Save unresolved relation reference

Parameters:

  • docnumber (String)

    of main document

  • amsid (Nokogiri::XML::Element)

    relation data



125
126
127
128
129
130
131
132
133
# File 'lib/relaton_ieee/data_fetcher.rb', line 125

#
# Save unresolved relation reference
#
# @param docnumber [String] docnumber of the main document
# @param amsid [#type, #date_string] relation data; `type` is the relation
#   code and `date_string` is stored under the `:amsid` key
#
def add_crossref(docnumber, amsid)
  relation_code = amsid.type
  # codes explicitly mapped to `false` are ignored; unknown codes are kept
  return if RELATION_TYPES[relation_code] == false

  (@crossrefs[docnumber] ||= []) << { amsid: amsid.date_string, type: relation_code }
end

#create_relation(type, fref) ⇒ RelatonBib::DocumentRelation

Create relation instance

Parameters:

  • type (String)

    IEEE relation type

  • fref (String)

    reference

Returns:

  • (RelatonBib::DocumentRelation)


190
191
192
193
194
195
196
197
198
199
200
201
202
203
# File 'lib/relaton_ieee/data_fetcher.rb', line 190

#
# Create relation instance
#
# @param type [String] IEEE relation type code (a key of RELATION_TYPES)
# @param fref [String] reference
#
# @return [RelatonBib::DocumentRelation, nil] nil when the type is ignored
#   (mapped to `false`) or unknown (not present in RELATION_TYPES)
#
def create_relation(type, fref) # rubocop:disable Metrics/MethodLength
  relation = RELATION_TYPES[type]
  return if relation == false # deliberately ignored relation type

  # An unknown code used to fail with NoMethodError on `nil[:description]`;
  # report it and skip the relation instead.
  unless relation
    Util.warn "Unknown relation type: '#{type}' for '#{fref}'"
    return
  end

  fr = RelatonBib::FormattedRef.new(content: fref)
  docid = RelatonBib::DocumentIdentifier.new(type: "IEEE", id: fref, primary: true)
  bib = IeeeBibliographicItem.new formattedref: fr, docid: [docid]
  desc = relation[:description]
  # the description is optional; stays nil when the relation type has none
  description = desc && RelatonBib::FormattedString.new(content: desc, language: "en", script: "Latn")
  RelatonBib::DocumentRelation.new(
    type: relation[:type],
    description: description,
    bibitem: bib,
  )
end

#fetch ⇒ Object

Convert documents from the `ieee-rawbib` dir (IEEE dataset) to BibYAML/BibXML



56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/relaton_ieee/data_fetcher.rb', line 56

#
# Convert documents from the `ieee-rawbib` dir (IEEE dataset) to BibYAML/BibXML.
# Files whose path contains "Deleted_" are skipped. An error raised while
# handling one file is logged and does not stop the remaining files.
# Relations are updated after all files are processed.
#
def fetch # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  sources = Dir["ieee-rawbib/**/*.{xml,zip}"].reject { |path| path.include?("Deleted_") }
  sources.each do |path|
    begin
      ext = File.extname(path)
      # content stays nil for any other extension
      content = if ext == ".zip" then read_zip(path)
                elsif ext == ".xml" then File.read(path, encoding: "UTF-8")
                end
      fetch_doc(content, path)
    rescue StandardError => e
      Util.error "File: #{path}\n#{e.message}\n#{e.backtrace}"
    end
  end
  # File.write "normtitles.txt", @normtitles.join("\n")
  update_relations
end

#fetch_doc(xml, filename) ⇒ Object

Parse document and save it

Parameters:

  • xml (String)

    content

  • filename (String)

    source file



90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/relaton_ieee/data_fetcher.rb', line 90

#
# Parse document and save it
#
# Skips content that cannot be parsed, publications whose standard_id is "0",
# and documents for which no docnumber could be built. When the PubID is
# already registered under another AMSID (and the source file name does not
# match `updates.<digits>`), the file is rewritten only if the primary
# identifier includes the stdnumber.
#
# @param xml [String] content
# @param filename [String] source file
#
def fetch_doc(xml, filename) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
  begin
    doc = Ieee::Idams::Publication.from_xml(xml)
  rescue StandardError
    Util.warn "Empty file: `#{filename}`"
    return
  end
  return if doc.publicationinfo&.standard_id == "0"

  bib = IdamsParser.new(doc, self).parse
  if bib.docnumber.nil?
    Util.warn "PubID parse error. Normtitle: `#{doc.normtitle}`, file: `#{filename}`"
    return
  end

  amsid = doc.publicationinfo.amsid
  pubid = bib.docidentifier[0].id
  already_known = backrefs.value?(pubid) && /updates\.\d+/ !~ filename
  if already_known
    oamsid = backrefs.key pubid
    Util.warn "Document exists ID: `#{pubid}` AMSID: " \
         "`#{amsid}` source: `#{filename}`. Other AMSID: `#{oamsid}`"
    # rewrite file only if the PubID matches to the stdnumber
    return unless bib.docidentifier.find(&:primary).id.include?(doc.publicationinfo.stdnumber)
  end

  save_doc bib
  backrefs[amsid] = pubid
end

#file_name(docnumber) ⇒ String

Make filename from PubID

Parameters:

  • docnumber (String)

Returns:

  • (String)

    filename



156
157
158
159
# File 'lib/relaton_ieee/data_fetcher.rb', line 156

#
# Make filename from PubID
#
# @param docnumber [String]
#
# @return [String] path inside the output dir, with the output extension
#
def file_name(docnumber)
  base = docnumber.gsub(/\s-/, "-")         # drop whitespace in front of a hyphen
  base = base.gsub(/[\s,:\/]/, "_")         # replace separators with underscores
  base = base.squeeze("_").upcase           # collapse runs of underscores
  File.join(@output, "#{base}.#{@ext}")
end

#read_bib(docnumber) ⇒ RelatonIeee::IeeeBibliographicItem

Read document from BibXML/BibYAML file

Parameters:

  • docnumber (String)

Returns:



212
213
214
215
216
217
218
219
# File 'lib/relaton_ieee/data_fetcher.rb', line 212

#
# Read document from BibXML/BibYAML file
#
# @param docnumber [String]
#
# @return [RelatonIeee::IeeeBibliographicItem]
#
def read_bib(docnumber)
  content = File.read(file_name(docnumber), encoding: "UTF-8")
  return XMLParser.from_xml(content) if @format == "xml"
  return BibXMLParser.parse(content) if @format == "bibxml"

  # any other format is read as YAML
  IeeeBibliographicItem.from_hash(YAML.safe_load(content))
end

#read_zip(file) ⇒ String

Extract XML file from zip archive

Parameters:

  • file (String)

    path to archive

Returns:

  • (String)

    file content



77
78
79
80
81
82
# File 'lib/relaton_ieee/data_fetcher.rb', line 77

#
# Extract XML file from zip archive
#
# Only the first entry matching `**/*.xml` is read.
# NOTE(review): presumably raises NoMethodError when the archive has no
# matching entry (`first` would be nil) - confirm.
#
# @param file [String] path to archive
#
# @return [String] file content
#
def read_zip(file)
  Zip::File.open(file) do |archive|
    archive.glob("**/*.xml").first.get_input_stream.read
  end
end

#save_doc(bib) ⇒ Object

Save document to file



140
141
142
143
144
145
146
147
# File 'lib/relaton_ieee/data_fetcher.rb', line 140

#
# Save document to file
#
# The file path is derived from the document's docnumber via `file_name`.
#
# @param bib [RelatonIeee::IeeeBibliographicItem] presumably the parsed
#   document; must respond to `docnumber` and the serializer for the format
#
def save_doc(bib)
  content =
    if @format == "xml" then bib.to_xml(bibdata: true)
    elsif @format == "yaml" then bib.to_hash.to_yaml
    else bib.send("to_#{@format}") # any other format maps to a `to_<format>` method
    end
  File.write(file_name(bib.docnumber), content, encoding: "UTF-8")
end

#update_relations ⇒ Object

Update unresolved relations



164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/relaton_ieee/data_fetcher.rb', line 164

#
# Update unresolved relations
#
# For every document with saved cross-references, resolves each referenced
# AMSID through `backrefs`, appends the created relations to the document and
# writes the document back once. References whose AMSID is not in `backrefs`
# are reported with a warning. Documents without any resolvable relation are
# neither read nor rewritten.
#
def update_relations # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  @crossrefs.each do |dnum, rfs|
    bib = nil
    rfs.each do |rf|
      target = backrefs[rf[:amsid]]
      unless target
        Util.warn "Unresolved relation: '#{rf[:amsid]}' type: '#{rf[:type]}' for '#{dnum}'"
        next
      end

      rel = create_relation(rf[:type], target)
      next unless rel

      bib ||= read_bib(dnum) # read the document lazily, only when a relation is added
      bib.relation << rel
    end
    # write once per document instead of once per added relation
    save_doc bib if bib
  end
end