Class: RelatonIeee::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_ieee/data_fetcher.rb

Constant Summary collapse

# IEEE relation type codes mapped to the attributes of the relation to create.
# A +false+ value marks a code whose relations are skipped entirely
# (see #add_crossref and #create_relation).
RELATION_TYPES = {
  "S" => { type: "obsoletedBy" },
  "V" => { type: "updates", description: "revises" },
  "T" => { type: "updates", description: "amends" },
  "C" => { type: "updates", description: "corrects" },
  "O" => { type: "adoptedFrom" },
  "P" => { type: "complementOf", description: "supplement" },
  "N" => false,
  "G" => false,
  "F" => false,
  "I" => false,
  "E" => false,
  "B" => false,
  "W" => false,
}.freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Create RelatonIeee::DataFetcher instance

Parameters:

  • output (String)

    output dir

  • format (String)

    output format. Allowed values: “yaml” or “xml”



28
29
30
31
32
33
34
# File 'lib/relaton_ieee/data_fetcher.rb', line 28

# Set up a fetcher that writes converted documents into the given directory.
#
# @param output [String] output dir
# @param format [String] output format; it is also used to derive the
#   file extension of the written documents
def initialize(output, format)
  @output = output
  @format = format
  # A "bib" prefix is dropped for the extension, e.g. "bibxml" -> "xml".
  @ext = format.sub(/^bib/, "")
  # AMSID => PubID of the documents saved so far.
  @backrefs = {}
  # docnumber => list of relation references collected by #add_crossref.
  @crossrefs = {}
end

Instance Attribute Details

#backrefs ⇒ Hash (readonly)

Returns list of AMSID => PubID.

Returns:

  • (Hash)

    list of AMSID => PubID



20
21
22
# File 'lib/relaton_ieee/data_fetcher.rb', line 20

# Reader for the AMSID => PubID map.
#
# @return [Hash] list of AMSID => PubID
def backrefs
  @backrefs
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ Object

Convert documents from `ieee-rawbib` dir (IEEE dataset) to BibYAML/BibXML

Parameters:

  • output (String) (defaults to: "data")

    (‘data’) output dir

  • format (String) (defaults to: "yaml")

    (‘yaml’) output format. Allowed values: “yaml” or “xml”



43
44
45
46
47
48
49
50
51
# File 'lib/relaton_ieee/data_fetcher.rb', line 43

# Convert documents from the `ieee-rawbib` dir (IEEE dataset) and report timing.
#
# Creates the output directory if needed, runs an instance-level #fetch and
# prints the start time, stop time and elapsed seconds to stdout.
#
# @param output [String] output dir
# @param format [String] output format
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  puts "Started at: #{started_at}"
  FileUtils.mkdir_p output
  new(output, format).fetch
  finished_at = Time.now
  puts "Stopped at: #{finished_at}"
  elapsed = (finished_at - started_at).round
  puts "Done in: #{elapsed} sec."
end

Instance Method Details

#add_crossref(docnumber, amsid) ⇒ Object

Save unresolved relation reference

Parameters:

  • docnumber (String)

    of main document

  • amsid (Nokogiri::XML::Element)

    relation data



127
128
129
130
131
132
133
134
135
# File 'lib/relaton_ieee/data_fetcher.rb', line 127

# Remember a relation reference so it can be resolved by #update_relations.
#
# Relation codes mapped to +false+ in RELATION_TYPES are dropped; codes that
# are absent from RELATION_TYPES are still recorded.
#
# @param docnumber [String] docnumber of the main document
# @param amsid [Nokogiri::XML::Element] relation data; its text is the
#   referenced AMSID and its +type+ attribute is the IEEE relation code
def add_crossref(docnumber, amsid)
  rel_type = amsid[:type]
  return if RELATION_TYPES[rel_type] == false

  (@crossrefs[docnumber] ||= []) << { amsid: amsid.text, type: rel_type }
end

#create_relation(type, fref) ⇒ RelatonBib::DocumentRelation

Create relation instance

Parameters:

  • type (String)

    IEEE relation type

  • fref (String)

    reference

Returns:

  • (RelatonBib::DocumentRelation)


192
193
194
195
196
197
198
199
200
201
202
203
204
205
# File 'lib/relaton_ieee/data_fetcher.rb', line 192

# Create relation instance
#
# Returns +nil+ when no relation should be created: either the code is mapped
# to +false+ in RELATION_TYPES or it is not listed there at all. An unlisted
# code previously raised NoMethodError on +nil+.
#
# @param type [String] IEEE relation type
# @param fref [String] reference
#
# @return [RelatonBib::DocumentRelation, nil]
def create_relation(type, fref) # rubocop:disable Metrics/MethodLength
  rel_attrs = RELATION_TYPES[type]
  # Covers both the explicit `false` entries and unknown (nil) codes.
  return unless rel_attrs

  fr = RelatonBib::FormattedRef.new(content: fref)
  docid = RelatonBib::DocumentIdentifier.new(type: "IEEE", id: fref, primary: true)
  bib = IeeeBibliographicItem.new formattedref: fr, docid: [docid]
  desc = rel_attrs[:description]
  # Only some relation types carry a description.
  description = desc && RelatonBib::FormattedString.new(content: desc, language: "en", script: "Latn")
  RelatonBib::DocumentRelation.new(
    type: rel_attrs[:type],
    description: description,
    bibitem: bib,
  )
end

#fetch ⇒ Object

Convert documents from `ieee-rawbib` dir (IEEE dataset) to BibYAML/BibXML



56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/relaton_ieee/data_fetcher.rb', line 56

# Convert every document found under the `ieee-rawbib` dir.
#
# Files whose path contains "Deleted_" are ignored. Each remaining .xml/.zip
# file is read and handed to #fetch_doc; an error in one file is logged and
# does not stop the others. Relations are resolved once all files are done.
def fetch # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  sources = Dir["ieee-rawbib/**/*.{xml,zip}"].reject { |path| path.include?("Deleted_") }
  sources.each do |path|
    ext = File.extname(path)
    xml = if ext == ".zip"
            read_zip path
          elsif ext == ".xml"
            File.read path, encoding: "UTF-8"
          end
    fetch_doc xml, path
  rescue StandardError => e
    Util.error "File: #{path}\n#{e.message}\n#{e.backtrace}"
  end
  # File.write "normtitles.txt", @normtitles.join("\n")
  update_relations
end

#fetch_doc(xml, filename) ⇒ Object

Parse document and save it

Parameters:

  • xml (String)

    content

  • filename (String)

    source file



90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/relaton_ieee/data_fetcher.rb', line 90

# Parse document and save it
#
# Nothing is saved when the file has no /publication root, when its
# standard_id is "0", or when the parser cannot produce a docnumber.
# When the PubID was already saved from another file (and this file is not
# an "updates.N" one) a warning is logged and the existing file is rewritten
# only if the PubID contains the document's stdnumber.
#
# @param xml [String] content
# @param filename [String] source file
def fetch_doc(xml, filename) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
  publication = Nokogiri::XML(xml).at("/publication")
  unless publication
    Util.warn "Empty file: `#{filename}`"
    return
  end
  return if publication.at("./publicationinfo/standard_id")&.text == "0"

  bib = DataParser.new(publication, self).parse
  if bib.docnumber.nil?
    normtitle = publication.at("./normtitle")&.text
    Util.warn "PubID parse error. Normtitle: `#{normtitle}`, file: `#{filename}`"
    return
  end

  amsid = publication.at("./publicationinfo/amsid").text
  pubid = bib.docidentifier[0].id
  if backrefs.value?(pubid) && !/updates\.\d+/.match?(filename)
    other_amsid = backrefs.key pubid
    Util.warn "Document exists ID: `#{pubid}` AMSID: " \
         "`#{amsid}` source: `#{filename}`. Other AMSID: `#{other_amsid}`"
    # rewrite file only if the PubID matches the stdnumber
    return unless pubid.include?(publication.at("./publicationinfo/stdnumber").text)
  end
  save_doc bib
  backrefs[amsid] = pubid
end

#file_name(docnumber) ⇒ String

Make filename from PubID

Parameters:

  • docnumber (String)

Returns:

  • (String)

    filename



158
159
160
161
# File 'lib/relaton_ieee/data_fetcher.rb', line 158

# Make filename from PubID
#
# A whitespace before a hyphen is removed; whitespace, commas, colons and
# slashes become underscores (runs of underscores are collapsed) and the
# result is upcased.
#
# @param docnumber [String]
#
# @return [String] path inside the output dir, with the format's extension
def file_name(docnumber)
  hyphen_fixed = docnumber.gsub(/\s-/, "-")
  underscored = hyphen_fixed.gsub(/[\s,:\/]/, "_").squeeze("_")
  File.join(@output, "#{underscored.upcase}.#{@ext}")
end

#read_bib(docnumber) ⇒ RelatonIeee::IeeeBibliographicItem

Read document from BibXML/BibYAML file

Parameters:

  • docnumber (String)

Returns:

  • (RelatonIeee::IeeeBibliographicItem)



214
215
216
217
218
219
220
221
# File 'lib/relaton_ieee/data_fetcher.rb', line 214

# Read a previously saved document back from the output dir.
#
# The parser is chosen by the configured format: "xml", "bibxml", and YAML
# for anything else.
#
# @param docnumber [String]
#
# @return [RelatonIeee::IeeeBibliographicItem]
def read_bib(docnumber)
  content = File.read(file_name(docnumber), encoding: "UTF-8")
  return XMLParser.from_xml(content) if @format == "xml"
  return BibXMLParser.parse(content) if @format == "bibxml"

  IeeeBibliographicItem.from_hash YAML.safe_load(content)
end

#read_zip(file) ⇒ String

Extract XML file from zip archive

Parameters:

  • file (String)

    path to archive

Returns:

  • (String)

    file content



77
78
79
80
81
82
# File 'lib/relaton_ieee/data_fetcher.rb', line 77

# Extract XML file from zip archive
#
# Only the first entry matching "**/*.xml" is read.
#
# @param file [String] path to archive
#
# @return [String] file content
def read_zip(file)
  Zip::File.open(file) do |archive|
    archive.glob("**/*.xml").first.get_input_stream.read
  end
end

#save_doc(bib) ⇒ Object

Save document to file



142
143
144
145
146
147
148
149
# File 'lib/relaton_ieee/data_fetcher.rb', line 142

# Save document to file
#
# The document is serialised according to the configured format: "xml" uses
# +to_xml(bibdata: true)+, "yaml" uses +to_hash.to_yaml+, and any other
# format calls the matching +to_<format>+ method on the document.
#
# @param bib [RelatonIeee::IeeeBibliographicItem] document to write
#   (type presumed from #read_bib; it must respond to +docnumber+)
def save_doc(bib)
  content = if @format == "xml"
              bib.to_xml(bibdata: true)
            elsif @format == "yaml"
              bib.to_hash.to_yaml
            else
              bib.send("to_#{@format}")
            end
  File.write file_name(bib.docnumber), content, encoding: "UTF-8"
end

#update_relations ⇒ Object

Update unresolved relations



166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/relaton_ieee/data_fetcher.rb', line 166

# Update unresolved relations
#
# For every document with recorded cross references, each reference whose
# AMSID is present in #backrefs is turned into a relation and appended to the
# document. A reference with an unknown AMSID is logged as unresolved.
#
# The document is read lazily (only when the first relation is created) and
# written back once, after all of its relations were appended, instead of
# being rewritten after every single relation.
def update_relations # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  @crossrefs.each do |dnum, rfs|
    bib = nil
    rfs.each do |rf|
      fref = backrefs[rf[:amsid]]
      unless fref
        Util.warn "Unresolved relation: '#{rf[:amsid]}' type: '#{rf[:type]}' for '#{dnum}'"
        next
      end

      rel = create_relation(rf[:type], fref)
      next unless rel # relation type is not converted

      bib ||= read_bib(dnum)
      bib.relation << rel
    end
    # bib stays nil when no relation was created; nothing to rewrite then.
    save_doc bib if bib
  end
end