Class: RelatonIeee::DataFetcher
- Inherits:
-
Object
- Object
- RelatonIeee::DataFetcher
- Defined in:
- lib/relaton_ieee/data_fetcher.rb
Constant Summary collapse
# Maps the relation code read from an AMSID element's `type` attribute to the
# attributes of the relation that gets created for it.
# A `false` value marks a code that is skipped: both add_crossref and
# create_relation return early when the lookup yields `false`.
RELATION_TYPES = {
  "S" => { type: "obsoletedBy" },
  "V" => { type: "updates", description: "revises" },
  "T" => { type: "updates", description: "amends" },
  "C" => { type: "updates", description: "corrects" },
  "O" => { type: "adoptedFrom" },
  "P" => { type: "complementOf", description: "supplement" },
  "N" => false,
  "G" => false,
  "F" => false,
  "I" => false,
  "E" => false,
  "B" => false,
  "W" => false,
}.freeze
Instance Attribute Summary collapse
-
#backrefs ⇒ Hash
readonly
List of AMSID => PubID.
Class Method Summary collapse
-
.fetch(output: "data", format: "yaml") ⇒ Object
Convert documents from `ieee-rawbib` dir (IEEE dataset) to BibYAML/BibXML.
Instance Method Summary collapse
-
#add_crossref(docnumber, amsid) ⇒ Object
Save unresolved relation reference.
-
#create_relation(type, fref) ⇒ RelatonBib::DocumentRelation
Create relation instance.
-
#fetch ⇒ Object
Convert documents from `ieee-rawbib` dir (IEEE dataset) to BibYAML/BibXML.
-
#fetch_doc(xml, filename) ⇒ Object
Parse document and save it.
-
#file_name(docnumber) ⇒ String
Make filename from PubID.
-
#initialize(output, format) ⇒ DataFetcher
constructor
Create RelatonIeee::DataFetcher instance.
-
#read_bib(docnumber) ⇒ RelatonIeee::IeeeBibliographicItem
Read document from BibXML/BibYAML file.
-
#read_zip(file) ⇒ String
Extract XML file from zip archive.
-
#save_doc(bib) ⇒ Object
Save document to file.
-
#update_relations ⇒ Object
Update unresolved relations.
Constructor Details
#initialize(output, format) ⇒ DataFetcher
Create RelatonIeee::DataFetcher instance
28 29 30 31 32 33 34 |
# File 'lib/relaton_ieee/data_fetcher.rb', line 28
#
# Create RelatonIeee::DataFetcher instance.
#
# @param output [String] directory the converted documents are written to
# @param format [String] output format; the other methods of this class
#   branch on "xml", "yaml" and "bibxml"
def initialize(output, format)
  # Both lookup tables start empty and are filled while documents are parsed.
  @crossrefs = {}
  @backrefs = {}
  @output = output
  @format = format
  # The file extension is the format name without a leading "bib",
  # so "bibxml" documents are stored as ".xml" files.
  @ext = format.sub(/^bib/, "")
end
Instance Attribute Details
#backrefs ⇒ Hash (readonly)
Returns list of AMSID => PubID.
20 21 22 |
# File 'lib/relaton_ieee/data_fetcher.rb', line 20
#
# Read-only access to the AMSID => PubID table.
#
# @return [Hash] list of AMSID => PubID
def backrefs
  @backrefs
end
Class Method Details
.fetch(output: "data", format: "yaml") ⇒ Object
Convert documents from `ieee-rawbib` dir (IEEE dataset) to BibYAML/BibXML
43 44 45 46 47 48 49 50 51 |
# File 'lib/relaton_ieee/data_fetcher.rb', line 43
#
# Convert documents from `ieee-rawbib` dir (IEEE dataset) to BibYAML/BibXML.
# Prints the start time, the stop time and the elapsed seconds to STDOUT.
#
# @param output [String] output directory; created when it does not exist
# @param format [String] output format handed to the new instance
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  puts "Started at: #{started_at}"

  FileUtils.mkdir_p output
  new(output, format).fetch

  stopped_at = Time.now
  puts "Stopped at: #{stopped_at}"
  puts "Done in: #{(stopped_at - started_at).round} sec."
end
Instance Method Details
#add_crossref(docnumber, amsid) ⇒ Object
Save unresolved relation reference
127 128 129 130 131 132 133 134 135 |
# File 'lib/relaton_ieee/data_fetcher.rb', line 127
#
# Save unresolved relation reference.
#
# @param docnumber [String] key under which the reference is collected
# @param amsid [#text, #[]] element providing the AMSID as its text and the
#   relation code under `:type` (presumably a Nokogiri node - confirm at the
#   call site)
def add_crossref(docnumber, amsid)
  code = amsid[:type]
  # Codes mapped to `false` in RELATION_TYPES are not recorded at all.
  return if RELATION_TYPES[code] == false

  # Start the per-document list on first use, then append to it.
  (@crossrefs[docnumber] ||= []) << { amsid: amsid.text, type: code }
end
#create_relation(type, fref) ⇒ RelatonBib::DocumentRelation
Create relation instance
192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
# File 'lib/relaton_ieee/data_fetcher.rb', line 192
#
# Create relation instance.
#
# @param type [String] relation code, looked up in RELATION_TYPES
# @param fref [String] PubID of the related document; used both as the
#   formatted reference and as the primary "IEEE" document identifier
# @return [RelatonBib::DocumentRelation, nil] nil when the code is mapped to
#   `false` (ignored) or is not listed in RELATION_TYPES at all
def create_relation(type, fref) # rubocop:disable Metrics/MethodLength
  mapping = RELATION_TYPES[type]
  return if mapping == false

  unless mapping
    # A code missing from RELATION_TYPES used to raise NoMethodError on nil
    # below; report it and skip the relation instead.
    Util.warn "Unknown relation type: '#{type}' for '#{fref}'"
    return
  end

  fr = RelatonBib::FormattedRef.new(content: fref)
  docid = RelatonBib::DocumentIdentifier.new(type: "IEEE", id: fref, primary: true)
  bib = IeeeBibliographicItem.new formattedref: fr, docid: [docid]
  desc = mapping[:description]
  # Only some relation types carry a description; it stays nil otherwise.
  description = desc && RelatonBib::FormattedString.new(content: desc, language: "en", script: "Latn")
  RelatonBib::DocumentRelation.new(
    type: mapping[:type],
    description: description,
    bibitem: bib,
  )
end
#fetch ⇒ Object
Convert documents from `ieee-rawbib` dir (IEEE dataset) to BibYAML/BibXML
56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/relaton_ieee/data_fetcher.rb', line 56
#
# Convert documents from `ieee-rawbib` dir (IEEE dataset) to BibYAML/BibXML.
#
# Every `.xml` and `.zip` file below `ieee-rawbib/` is processed, except
# paths containing "Deleted_". A failure on one file is logged and does not
# stop the run. Relations collected along the way are resolved at the end.
def fetch # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  Dir["ieee-rawbib/**/*.{xml,zip}"].reject { |f| f["Deleted_"] }.each do |f|
    xml = case File.extname(f)
          when ".zip" then read_zip f
          when ".xml" then File.read f, encoding: "UTF-8"
          end
    fetch_doc xml, f
  rescue StandardError => e
    # Log the file together with the exception message and backtrace.
    Util.error "File: #{f}\n#{e.message}\n#{e.backtrace}"
  end
  # File.write "normtitles.txt", @normtitles.join("\n")
  update_relations
end
#fetch_doc(xml, filename) ⇒ Object
Parse document and save it
90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/relaton_ieee/data_fetcher.rb', line 90
#
# Parse document and save it.
#
# Nothing is saved when the XML has no `/publication` root, when its
# `standard_id` is "0", or when the parsed item has no docnumber.
# Otherwise the item is written out and its AMSID => PubID pair is recorded
# in `backrefs`. A PubID that is already recorded, coming from a file whose
# name does not match `updates.<digits>`, is reported as a duplicate and only
# overwrites the stored file when the PubID contains the document's stdnumber.
#
# @param xml [String] XML content
# @param filename [String] source file name, used in log messages and for the
#   `updates.<digits>` check
def fetch_doc(xml, filename) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
  doc = Nokogiri::XML(xml).at("/publication")
  if doc.nil?
    Util.warn "Empty file: `#{filename}`"
    return
  end
  return if doc.at("./publicationinfo/standard_id")&.text == "0"

  # The fetcher itself is handed to the parser (presumably so it can register
  # cross references through add_crossref - confirm in DataParser).
  bib = DataParser.new(doc, self).parse
  if bib.docnumber.nil?
    normtitle = doc.at("./normtitle")&.text
    Util.warn "PubID parse error. Normtitle: `#{normtitle}`, file: `#{filename}`"
    return
  end

  amsid = doc.at("./publicationinfo/amsid").text
  pubid = bib.docidentifier[0].id
  already_saved = backrefs.value?(pubid) && /updates\.\d+/ !~ filename
  if already_saved
    oamsid = backrefs.key pubid
    Util.warn "Document exists ID: `#{pubid}` AMSID: " \
              "`#{amsid}` source: `#{filename}`. Other AMSID: `#{oamsid}`"
    # rewrite file only if the PubID matches to the stdnumber
    return unless pubid.include?(doc.at("./publicationinfo/stdnumber").text)
  end

  save_doc bib
  backrefs[amsid] = pubid
end
#file_name(docnumber) ⇒ String
Make filename from PubID
158 159 160 161 |
# File 'lib/relaton_ieee/data_fetcher.rb', line 158
#
# Make filename from PubID.
#
# Whitespace directly before a hyphen is dropped; whitespace, commas, colons
# and slashes become underscores; runs of underscores collapse to one; the
# result is upper-cased.
#
# @param docnumber [String] PubID
# @return [String] path inside the output directory, with the format extension
def file_name(docnumber)
  basename = docnumber
    .gsub(/\s-/, "-")
    .gsub(/[\s,:\/]/, "_")
    .squeeze("_")
    .upcase
  File.join(@output, "#{basename}.#{@ext}")
end
#read_bib(docnumber) ⇒ RelatonIeee::IeeeBibliographicItem
Read document form BibXML/BibYAML file
214 215 216 217 218 219 220 221 |
# File 'lib/relaton_ieee/data_fetcher.rb', line 214
#
# Read document from BibXML/BibYAML file.
#
# The file is located with file_name and parsed according to the configured
# format: "xml" and "bibxml" each use their own parser, anything else is
# loaded as YAML.
#
# @param docnumber [String] PubID of the stored document
# @return [RelatonIeee::IeeeBibliographicItem]
def read_bib(docnumber)
  content = File.read file_name(docnumber), encoding: "UTF-8"
  return XMLParser.from_xml(content) if @format == "xml"
  return BibXMLParser.parse(content) if @format == "bibxml"

  IeeeBibliographicItem.from_hash YAML.safe_load(content)
end
#read_zip(file) ⇒ String
Extract XML file from zip archive
77 78 79 80 81 82 |
# File 'lib/relaton_ieee/data_fetcher.rb', line 77
#
# Extract XML file from zip archive.
#
# Only the first entry matching `**/*.xml` is read.
# NOTE(review): there is no guard for an archive without any XML entry; the
# call on the missing entry would raise - confirm this is acceptable.
#
# @param file [String] path to the archive
# @return [String] content of the XML entry
def read_zip(file)
  Zip::File.open(file) do |archive|
    archive.glob("**/*.xml").first.get_input_stream.read
  end
end
#save_doc(bib) ⇒ Object
Save document to file
142 143 144 145 146 147 148 149 |
# File 'lib/relaton_ieee/data_fetcher.rb', line 142
#
# Save document to file.
#
# "xml" serializes with `to_xml(bibdata: true)`, "yaml" with
# `to_hash.to_yaml`; any other format calls the item's `to_<format>` method.
# The target path comes from file_name and an existing file is overwritten.
#
# @param bib [RelatonIeee::IeeeBibliographicItem] item to store
def save_doc(bib)
  content = if @format == "xml"
              bib.to_xml(bibdata: true)
            elsif @format == "yaml"
              bib.to_hash.to_yaml
            else
              bib.send("to_#{@format}")
            end
  File.write file_name(bib.docnumber), content, encoding: "UTF-8"
end
#update_relations ⇒ Object
Update unresoverd relations
166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
# File 'lib/relaton_ieee/data_fetcher.rb', line 166
#
# Update unresolved relations.
#
# For each collected cross reference the AMSID is looked up in `backrefs`.
# A resolved reference becomes a relation that is appended to the stored
# document, which is saved again after every appended relation. An AMSID
# without a backref is reported with a warning.
def update_relations # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  @crossrefs.each do |dnum, rfs|
    bib = nil # read lazily, only once a relation is actually added
    rfs.each do |rf|
      pubid = backrefs[rf[:amsid]]
      unless pubid
        Util.warn "Unresolved relation: '#{rf[:amsid]}' type: '#{rf[:type]}' for '#{dnum}'"
        next
      end

      rel = create_relation(rf[:type], pubid)
      next unless rel

      bib ||= read_bib(dnum)
      bib.relation << rel
      save_doc bib
    end
  end
end