Class: RelatonCie::DataFetcher
- Inherits:
-
Object
- Object
- RelatonCie::DataFetcher
- Defined in:
- lib/relaton_cie/data_fetcher.rb
Constant Summary collapse
- URL =
"https://www.techstreet.com/cie/searches/31156444?page=1&per_page=100"
Class Method Summary collapse
Instance Method Summary collapse
- #agent ⇒ Object
- #content(bib) ⇒ Object
- #fetch(url) ⇒ Object
- #fetch_abstract(doc) ⇒ Array<RelatonBib::FormattedString>
- #fetch_contributor(doc) ⇒ Array<Hash>
- #fetch_date(doc) ⇒ Array<RelatonBib::BibliographicDate>
- #fetch_docid(hit, doc) ⇒ Array<RelatonBib::DocumentIdentifier>
- #fetch_docnumber(hit) ⇒ Object
- #fetch_doctype ⇒ Object
- #fetch_edition(doc) ⇒ String
- #fetch_link(url) ⇒ Array<RelatonBib::TypedUri>
- #fetch_relation(doc) ⇒ Array<Hash>
- #fetch_title(doc) ⇒ RelatonBib::TypedTitleStringCollection, Array
- #index ⇒ Object
-
#initialize(output, format) ⇒ DataFetcher
constructor
A new instance of DataFetcher.
-
#parse_cie_code(code1, code2, doc = nil) ⇒ Object
rubocop:disable Metrics/CyclomaticComplexity.
- #parse_code(hit, doc = nil) ⇒ Object
- #parse_page(hit) ⇒ Object
- #primary_code(code, doc = nil) ⇒ Object
- #time_req ⇒ Object
- #write_file(bib) ⇒ Object
Constructor Details
#initialize(output, format) ⇒ DataFetcher
Returns a new instance of DataFetcher.
12 13 14 15 16 17 |
# File 'lib/relaton_cie/data_fetcher.rb', line 12

#
# Build a fetcher that writes documents into +output+ using +format+.
#
# @param output [String] directory the document files are written to
# @param format [String] output format; "bibxml" files get the "xml" extension
#
def initialize(output, format)
  @output = output
  @format = format
  # Paths written so far; lets write_file detect duplicate targets.
  @files = []
  @ext = if format == "bibxml"
           "xml"
         else
           format
         end
end
Class Method Details
.fetch(output: "data", format: "yaml") ⇒ Object
228 229 230 231 232 233 234 235 236 237 238 |
# File 'lib/relaton_cie/data_fetcher.rb', line 228

#
# Entry point: create the output directory, crawl the catalogue starting
# at URL and print start/stop timestamps plus the elapsed seconds.
#
# @param output [String] directory to write documents to
# @param format [String] output format passed to the fetcher
#
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  puts "Started at: #{started_at}"
  FileUtils.mkdir_p output
  fetcher = new(output, format)
  fetcher.fetch URL
  stopped_at = Time.now
  puts "Stopped at: #{stopped_at}"
  elapsed = (stopped_at - started_at).round
  puts "Done in: #{elapsed} sec."
end
Instance Method Details
#agent ⇒ Object
19 20 21 |
# File 'lib/relaton_cie/data_fetcher.rb', line 19

#
# Memoized Mechanize agent used for every HTTP request.
#
# @return [Mechanize]
#
def agent
  return @agent if @agent

  @agent = Mechanize.new
end
#content(bib) ⇒ Object
182 183 184 185 186 187 188 |
# File 'lib/relaton_cie/data_fetcher.rb', line 182

#
# Serialize a bibliographic item according to the configured format.
#
# @param bib [#to_xml, #to_hash, #to_bibxml] item to serialize
#
# @return [String, nil] serialized document; nil when the format is not
#   "xml", "yaml" or "bibxml"
#
def content(bib)
  if @format == "xml"
    bib.to_xml(bibdata: true)
  elsif @format == "yaml"
    bib.to_hash.to_yaml
  elsif @format == "bibxml"
    bib.to_bibxml
  end
end
#fetch(url) ⇒ Object
209 210 211 212 213 214 215 216 217 218 |
# File 'lib/relaton_cie/data_fetcher.rb', line 209

#
# Crawl the search results starting at +url+, parse every hit on each
# page, follow the "next_page" link until there is none, then save the index.
#
# Pages are walked in a loop rather than by recursion so that earlier
# result pages are not kept alive on the call stack while later ones
# are being processed.
#
# @param url [String] URL of the first results page
#
# @return [Object] whatever index.save returns
#
def fetch(url)
  loop do
    page = time_req { agent.get url }
    page.xpath("//li[@data-product]").each { |hit| parse_page hit }
    np = page.at '//a[@class="next_page"]'
    break unless np

    # The next-page href is site-relative, so prefix the host.
    url = "https://www.techstreet.com#{np[:href]}"
  end
  index.save
end
#fetch_abstract(doc) ⇒ Array<RelatonBib::FormattedString>
124 125 126 127 128 129 130 |
# File 'lib/relaton_cie/data_fetcher.rb', line 124

#
# Extract the abstract from the first div whose class contains "description".
#
# @param doc [#at] product page
#
# @return [Array<RelatonBib::FormattedString>] one-element array, or an
#   empty array when the node is missing or its text is blank after stripping
#
def fetch_abstract(doc)
  description = doc.at('//div[contains(@class,"description")]')
  text = description&.text&.strip
  return [] if text.nil? || text.empty?

  abstract = RelatonBib::FormattedString.new(content: text, language: "en",
                                             script: "Latn")
  [abstract]
end
#fetch_contributor(doc) ⇒ Array<Hash>
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
# File 'lib/relaton_cie/data_fetcher.rb', line 134

#
# Build the contributor list: one "author" entry per person parsed from the
# hgroup paragraphs (excluding the "pub_date" one), followed by CIE as
# the "publisher".
#
# The author string is consumed from the front: each iteration matches one
# name, removes the matched text and continues until nothing is left.
#
# @param doc [#xpath] product page
#
# @return [Array<Hash>] hashes with :entity and :role keys
#
# @raise [StandardError] when the remaining author text cannot be parsed
#
def fetch_contributor(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity
  # gsub returns a new string, so the sub! below never mutates the document.
  authors = doc.xpath('//hgroup/p[not(@class="pub_date")]').text.gsub "\"", ""
  name_pattern = /^(?<sname1>\S+(?:\sder?\s)?[^\s,]+)
    (?:,?\s(?<sname2>[\w-]{2,})(?=,\s+\w\.))?
    (?:,?\s(?<fname>[\w-]{2,})(?!,\s+\w\.))?
    (?:(?:\s?,\s?|\s)(?<init>(?:\w(?:\s?\.|\s|,|$)[\s-]?)+))?
    (?:(?:,\s*|\s+|\.|(?<=\s))(?:and\s)?)?/x
  contribs = []
  until authors.empty?
    name = name_pattern.match(authors)
    raise StandardError, "Author name not found in \"#{authors}\"" unless name

    # Drop the text just parsed so the next pass starts at the next name.
    authors.sub! name.to_s, ""
    sname = [name[:sname1], name[:sname2]].compact.join " "
    surname = RelatonBib::LocalizedString.new sname, "en", "Latn"
    initial = (name[:init]&.strip || "").split(/(?:,|\.)(?:-|\s)?/).map do |int|
      RelatonBib::LocalizedString.new(int.strip, "en", "Latn")
    end
    fname = name[:fname]
    forename = fname ? [RelatonBib::LocalizedString.new(fname, "en", "Latn")] : []
    fullname = RelatonBib::FullName.new surname: surname, forename: forename, initial: initial
    person = RelatonBib::Person.new name: fullname
    contribs << { entity: person, role: [{ type: "author" }] }
  end
  org = RelatonBib::Organization.new(
    name: "Commission Internationale de L'Eclairage", abbreviation: "CIE", url: "cie.co.at",
  )
  contribs << { entity: org, role: [{ type: "publisher" }] }
end
#fetch_date(doc) ⇒ Array<RelatonBib::BibliographicDate>
84 85 86 87 88 89 90 |
# File 'lib/relaton_cie/data_fetcher.rb', line 84

#
# Read the "Published:" entries and turn each into a "published" date.
#
# Text that starts with four digits not followed by a hyphen is passed
# through unchanged; anything else is parsed as month/day/year and
# reformatted as YYYY-MM-DD.
#
# @param doc [#xpath] product page
#
# @return [Array<RelatonBib::BibliographicDate>]
#
def fetch_date(doc)
  published = doc.xpath('//dt[.="Published:"]/following-sibling::dd[1]')
  published.map do |dd|
    raw = dd.text.strip
    on = if raw.match?(/^\d{4}(?:[^-]|$)/)
           raw
         else
           Date.strptime(raw, "%m/%d/%Y").strftime("%Y-%m-%d")
         end
    RelatonBib::BibliographicDate.new(type: "published", on: on)
  end
end
#fetch_docid(hit, doc) ⇒ Array<RelatonBib::DocumentIdentifier>
30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/relaton_cie/data_fetcher.rb', line 30

#
# Collect the document identifiers: the primary CIE identifier, an optional
# second identifier typed by its leading word, and an ISBN when the page
# lists one.
#
# @param hit [#at] search result entry
# @param doc [#at] product page
#
# @return [Array<RelatonBib::DocumentIdentifier>]
#
def fetch_docid(hit, doc)
  primary, secondary = parse_code hit, doc
  ids = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: primary, primary: true)]
  if secondary
    # The first word of the secondary code serves as its type.
    secondary_type = secondary.match(/\w+/).to_s
    ids << RelatonBib::DocumentIdentifier.new(type: secondary_type, id: secondary.strip)
  end
  isbn_node = doc.at('//dt[contains(.,"ISBN")]/following-sibling::dd')
  if isbn_node
    ids << RelatonBib::DocumentIdentifier.new(type: "ISBN", id: isbn_node.text.strip)
  end
  ids
end
#fetch_docnumber(hit) ⇒ Object
69 70 71 |
# File 'lib/relaton_cie/data_fetcher.rb', line 69

#
# Document number: the primary code with its first "word + whitespace"
# occurrence removed.
#
# @param hit [#at] search result entry
#
# @return [String]
#
def fetch_docnumber(hit)
  primary, = parse_code(hit)
  primary.sub(/\w+\s/, "")
end
#fetch_doctype ⇒ Object
163 164 165 |
# File 'lib/relaton_cie/data_fetcher.rb', line 163

#
# Document type assigned to every fetched item.
#
# @return [RelatonBib::DocumentType] always of type "document"
#
def fetch_doctype
  doctype = RelatonBib::DocumentType.new(type: "document")
  doctype
end
#fetch_edition(doc) ⇒ String
94 95 96 |
# File 'lib/relaton_cie/data_fetcher.rb', line 94

#
# Extract the edition number from the "Edition:" entry.
#
# @param doc [#at] product page
#
# @return [String, nil] leading digits of an ordinal such as "2nd"; nil when
#   the entry is missing or does not start with an ordinal number
#
def fetch_edition(doc)
  label = doc.at('//dt[.="Edition:"]/following-sibling::dd')&.text
  # The lookahead keeps only digits that are followed by an ordinal suffix.
  ordinal = label&.match(/^\d+(?=(st|nd|rd|th))/)
  ordinal&.to_s
end
#fetch_link(url) ⇒ Array<RelatonBib::TypedUri>
118 119 120 |
# File 'lib/relaton_cie/data_fetcher.rb', line 118

#
# Wrap the page URL as the item's source link.
#
# @param url [String] product page URL
#
# @return [Array<RelatonBib::TypedUri>] single link of type "src"
#
def fetch_link(url)
  source = RelatonBib::TypedUri.new(type: "src", content: url)
  [source]
end
#fetch_relation(doc) ⇒ Array<Hash>
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/relaton_cie/data_fetcher.rb', line 100

#
# Build relations from the "history" section, skipping the entry marked
# as the selected product.
#
# @param doc [#xpath] product page
#
# @return [Array<Hash>] hashes with :type ("updates" or "updatedBy") and
#   :bibitem keys
#
def fetch_relation(doc) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  entries = doc.xpath('//section[@class="history"]/ol/li[not(contains(@class,"selected-product"))]')
  entries.map do |entry|
    anchor = entry.at("a")
    url = "https://www.techstreet.com#{anchor[:href]}"
    title = RelatonBib::TypedTitleString.from_string anchor.at('p/span[@class="title"]').text
    docid = [RelatonBib::DocumentIdentifier.new(type: "CIE", id: anchor.at("h3").text, primary: true)]
    published = anchor.at("p/time")
    date = [RelatonBib::BibliographicDate.new(type: "published", on: published[:datetime])]
    link = [RelatonBib::TypedUri.new(type: "src", content: url)]
    bibitem = BibliographicItem.new docid: docid, title: title, link: link, date: date
    # NOTE(review): this XPath starts with "//", so it may not be scoped to
    # the current entry -- confirm against the page markup.
    type = if anchor.at('//li/i[contains(@class,"historical")]')
             "updates"
           else
             "updatedBy"
           end
    { type: type, bibitem: bibitem }
  end
end
#fetch_title(doc) ⇒ RelatonBib::TypedTitleStringCollection, Array
75 76 77 78 79 80 |
# File 'lib/relaton_cie/data_fetcher.rb', line 75

#
# Build the title from the hgroup heading (h2 or h1).
#
# @param doc [#at] product page
#
# @return [RelatonBib::TypedTitleStringCollection, Array] parsed title, or
#   an empty array when no heading is found
#
def fetch_title(doc)
  heading = doc.at("//hgroup/h2", "//hgroup/h1")
  if heading
    RelatonBib::TypedTitleString.from_string heading.text.strip
  else
    []
  end
end
#index ⇒ Object
23 24 25 |
# File 'lib/relaton_cie/data_fetcher.rb', line 23

#
# Memoized :cie index stored in "index-v1.yaml".
#
# @return [Object] the value returned by Relaton::Index.find_or_create
#
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:cie, file: "index-v1.yaml")
end
#parse_cie_code(code1, code2, doc = nil) ⇒ Object
rubocop:disable Metrics/CyclomaticComplexity
61 62 63 64 65 66 67 |
# File 'lib/relaton_cie/data_fetcher.rb', line 61

#
# Choose the CIE identifier and append an addendum suffix when the page's
# h2 heading ends with "Addendum <number>".
#
# @param code1 [String] leading part of the code
# @param code2 [String, nil] parenthesised part of the code, if any
# @param doc [#at, nil] product page, used only to detect an addendum
#
# @return [String]
#
def parse_cie_code(code1, code2, doc = nil) # rubocop:disable Metrics/CyclomaticComplexity
  # When code1 is longer than 25 characters and a parenthesised code
  # exists, the identifier is built from the latter instead.
  code = if code1.size > 25 && code2
           "CIE #{code2.sub(/,(\sPages)?/, '')}"
         else
           code1
         end
  heading = doc&.at("//hgroup/h2")&.text
  addendum = heading&.match(/(Add)endum\s(\d+)$/)
  addendum ? "#{code} #{addendum[1]} #{addendum[2]}" : code
end
#parse_code(hit, doc = nil) ⇒ Object
42 43 44 45 46 47 |
# File 'lib/relaton_cie/data_fetcher.rb', line 42

#
# Split the hit's heading into a primary code and an optional ISO/IEC code.
#
# @param hit [#at] search result entry
# @param doc [#at, nil] product page, forwarded to primary_code
#
# @return [Array(String, String)] primary code and secondary code; the
#   second element is nil when no ISO/IEC code is present
#
def parse_code(hit, doc = nil)
  label = hit.at("h3/a").text.strip.squeeze(" ")
  label = label.sub(/\u25b9/, "").gsub(" / ", "/")
  secondary = %r{(?:\(|/)(?<c2>(?:ISO|IEC)\s[^()]+)}.match(label)
  # Everything from the "(" or "/" that introduces the ISO/IEC code is cut off.
  label = label[0...secondary.begin(0)].strip if secondary
  [primary_code(label, doc), secondary && secondary[:c2]]
end
#parse_page(hit) ⇒ Object
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
# File 'lib/relaton_cie/data_fetcher.rb', line 191

#
# Fetch the product page behind a search hit, build a bibliographic item
# from it and write it to disk.
#
# Any StandardError is reported through Util.error together with the
# document URL, so one failing document does not propagate the error to
# the caller.
#
# @param hit [#at] search result entry
#
def parse_page(hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  url = "https://www.techstreet.com#{hit.at('h3/a')[:href]}"
  doc = time_req { agent.get url }
  item = BibliographicItem.new(
    type: "standard", link: fetch_link(url), docnumber: fetch_docnumber(hit),
    docid: fetch_docid(hit, doc), title: fetch_title(doc),
    abstract: fetch_abstract(doc), date: fetch_date(doc),
    edition: fetch_edition(doc), contributor: fetch_contributor(doc),
    relation: fetch_relation(doc), language: ["en"], script: ["Latn"],
    doctype: fetch_doctype
  )
  write_file item
rescue StandardError => e
  Util.error do
    "Document: #{url}\n#{e.message}\n#{e.backtrace}"
  end
end
#primary_code(code, doc = nil) ⇒ Object
49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/relaton_cie/data_fetcher.rb', line 49

#
# Derive the primary "CIE ..." identifier from a heading code.
#
# Three sources are tried in order: the code itself when it starts with
# "CIE", the page's "Product Code(s):" entry, and finally the
# parenthesised part of the code.
#
# @param code [String] cleaned heading code
# @param doc [#at, nil] product page
#
# @return [String]
#
def primary_code(code, doc = nil)
  parts = /^(?<code1>[^(]+)(?:\((?<code2>\w+\d+,(?:\sPages)?[^)]+))?/.match(code)
  code1 = parts && parts[:code1]
  return parse_cie_code(code1, parts[:code2], doc) if code1&.match?(/^CIE/)

  pcode = doc&.at('//dt[.="Product Code(s):"]/following-sibling::dd')
  # Only the first comma-separated product code is used.
  return "CIE #{pcode.text.strip.match(/[^,]+/)}" if pcode

  bracketed = code.match(/(?<=\()\w{2}\d+,.+(?=\))/).to_s
  num = bracketed.gsub(/,(?=\s)/, "").gsub(/,(?=\S)/, " ")
  "CIE #{num}"
end
#time_req ⇒ Object
220 221 222 223 224 225 226 |
# File 'lib/relaton_cie/data_fetcher.rb', line 220

#
# Run the block and then sleep for whatever is left of one second, so
# consecutive calls start at least a second apart.
#
# Elapsed time is measured with the monotonic clock so that wall-clock
# adjustments during a request cannot distort the pause.
#
# @yield the request to perform
#
# @return [Object] the block's result
#
def time_req
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  result = yield
  remaining = 1 - (Process.clock_gettime(Process::CLOCK_MONOTONIC) - started)
  sleep remaining if remaining.positive?
  result
end
#write_file(bib) ⇒ Object
168 169 170 171 172 173 174 175 176 177 178 179 180 |
# File 'lib/relaton_cie/data_fetcher.rb', line 168

#
# Serialize a bibliographic item to a file named after its first document
# identifier and register the file in the index.
#
# The file extension comes from @ext, which the constructor derives from
# the format ("bibxml" documents are stored with the "xml" extension).
# A warning is emitted when the same file is written a second time in the
# same run; the file is still overwritten.
#
# @param bib [#docidentifier, #link] item to write
#
def write_file(bib) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  docid = bib.docidentifier[0].id
  # Characters that are awkward in file names are replaced by underscores.
  id = docid.gsub(%r{[/\s\-:.]}, "_")
  file = "#{@output}/#{id.upcase}.#{@ext}"
  if @files.include? file
    Util.warn do
      "File #{file} exists. Docid: #{docid}\n" \
        "Link: #{bib.link.detect { |l| l.type == 'src' }&.content}"
    end
  else
    @files << file
  end
  index.add_or_update docid, file
  File.write file, content(bib), encoding: "UTF-8"
end