Module: RelatonGb::Scrapper
- Included in:
- GbScrapper, SecScrapper, TScrapper
- Defined in:
- lib/relaton_gb/scrapper.rb
Overview
Common scraping methods.
Constant Summary collapse
- STAGES =
{ "即将实施" => "published", "现行" => "activated", "废止" => "obsoleted", "被代替" => "replaced" }.freeze
Instance Method Summary collapse
- #fetch_structuredidentifier(docref) ⇒ RelatonIsoBib::StructuredIdentifier
- #get_contributors(doc, docref) ⇒ Array<Hash>
- #get_docid(docref) ⇒ Array<RelatonBib::DocumentIdentifier>
- #get_status(doc, status = nil) ⇒ RelatonBib::DocumentStatus
- #get_titles(doc) ⇒ Array<RelatonBib::TypedTitleString>
- #get_type ⇒ Object
- #org(lang, name, gbtype) ⇒ Hash
- #scrapped_data(doc, src, hit) ⇒ Hash
Instance Method Details
#fetch_structuredidentifier(docref) ⇒ RelatonIsoBib::StructuredIdentifier
49 50 51 52 53 54 55 |
# File 'lib/relaton_gb/scrapper.rb', line 49

# Builds the structured identifier for a document reference.
#
# The reference is split with a regular expression: the first capture is
# everything from the start up to a run of digits that contains no dash
# (hyphen, en dash, em dash) or dot; the second capture is the run of digits
# that immediately follows a dot, or an empty string when there is none.
#
# @param docref [String] document reference
# @return [RelatonIsoBib::StructuredIdentifier]
# @raise [NoMethodError] when +docref+ does not match the expression
#   (the match result is nil)
def fetch_structuredidentifier(docref)
  project, part = docref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/).captures
  RelatonIsoBib::StructuredIdentifier.new(
    type: "Chinese Standard",
    id: docref,
    prefix: nil,
    project_number: project,
    part_number: part,
  )
end
#get_contributors(doc, docref) ⇒ Array<Hash>
60 61 62 63 64 65 66 67 68 69 |
# File 'lib/relaton_gb/scrapper.rb', line 60

# Builds the publisher contributor entry for a document.
#
# The first whitespace-delimited token of +docref+ is used as the agency
# prefix; a trailing "/T" or "/Z" is removed from it unless the token starts
# with "GB". The agency name is then looked up via #org in English and
# Chinese.
#
# @param doc [Object] parsed document, passed through to get_gbtype
# @param docref [String] document reference
# @return [Array<Hash>] a single publisher entry, or an empty array when no
#   agency name was found in either language
def get_contributors(doc, docref)
  prefix = docref.match(/^[^\s]+/).to_s
  prefix = prefix.sub(%r{/[TZ]$}, "") if prefix !~ /^GB/
  gbtype = get_gbtype(doc, docref)
  names = %w[en zh].map { |lang| org(lang, prefix, gbtype) }.compact
  return [] if names.empty?

  publisher = RelatonBib::Organization.new(name: names)
  [{ entity: publisher, role: [{ type: "publisher" }] }]
end
#get_docid(docref) ⇒ Array<RelatonBib::DocumentIdentifier>
43 44 45 |
# File 'lib/relaton_gb/scrapper.rb', line 43

# Wraps a document reference in a single primary document identifier of
# type "Chinese Standard".
#
# @param docref [String] document reference
# @return [Array<RelatonBib::DocumentIdentifier>] one-element array
def get_docid(docref)
  identifier = RelatonBib::DocumentIdentifier.new(
    id: docref, type: "Chinese Standard", primary: true
  )
  [identifier]
end
#get_status(doc, status = nil) ⇒ RelatonBib::DocumentStatus
101 102 103 104 105 106 |
# File 'lib/relaton_gb/scrapper.rb', line 101

# Maps a status label to a document status via the STAGES table.
#
# When +status+ is not given, the label is read from the <span> inside the
# table cell containing '标准状态' (stripped of surrounding whitespace).
#
# @param doc [Object] parsed document responding to #at with an XPath
# @param status [String, nil] status label; looked up in +doc+ when nil
# @return [RelatonBib::DocumentStatus, nil] nil when the label has no entry
#   in STAGES
def get_status(doc, status = nil)
  status ||= doc.at("//td[contains(., '标准状态')]/span")&.text&.strip
  stage = STAGES[status]
  RelatonBib::DocumentStatus.new(stage: stage) if stage
end
#get_titles(doc) ⇒ Array<RelatonBib::TypedTitleString>
85 86 87 88 89 90 91 92 |
# File 'lib/relaton_gb/scrapper.rb', line 85

# Extracts the document titles.
#
# The Chinese title is read from the <b> element inside the table cell whose
# text contains '中文标准名称' and is required. The English title is read
# from the cell whose text contains '英文标准名称'; only the first run of
# word/whitespace characters of that cell is kept. The English title is
# optional: when the cell is missing or yields no such run, only the Chinese
# titles are returned.
#
# @param doc [Object] parsed document responding to #at with an XPath
# @return [Array<RelatonBib::TypedTitleString>]
# @raise [NoMethodError] when the Chinese title element is not found
def get_titles(doc)
  tzh = doc.at("//td[contains(text(), '中文标准名称')]/b").text
  titles = RelatonBib::TypedTitleString.from_string tzh, "zh", "Hans"

  # A page without an English-name cell is treated like an empty English
  # title instead of raising on nil.
  ten = doc.at("//td[contains(text(), '英文标准名称')]")&.text.to_s[/[\w\s]+/].to_s
  return titles if ten.empty?

  titles + RelatonBib::TypedTitleString.from_string(ten, "en", "Latn")
end
#get_type ⇒ Object
94 95 96 |
# File 'lib/relaton_gb/scrapper.rb', line 94

# Returns the document type, which is always "standard".
#
# @return [DocumentType]
def get_type
  DocumentType.new(type: "standard")
end
#org(lang, name, gbtype) ⇒ Hash
75 76 77 78 79 80 81 |
# File 'lib/relaton_gb/scrapper.rb', line 75

# Looks up the agency name for a standard prefix in the given language.
#
# @param lang [String] language code passed to GbAgencies::Agencies
# @param name [String] standard prefix
# @param gbtype [Hash] must provide the :scope and :mandate keys
# @return [Hash, nil] hash with :language and :content keys, or nil when the
#   agency lookup returns nothing
def org(lang, name, gbtype)
  agencies = GbAgencies::Agencies.new(lang, {}, "")
  agency_name = agencies.standard_agency1(gbtype[:scope], name, gbtype[:mandate])
  { language: lang, content: agency_name } if agency_name
end
#scrapped_data(doc, src, hit) ⇒ Hash
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/relaton_gb/scrapper.rb', line 21

# Assembles the attribute hash for a scraped document by delegating each
# field to the corresponding getter.
#
# @param doc [Object] parsed document passed to the individual getters
# @param src [String] source link, stored as the "src" link content
# @param hit [Object] search hit; must respond to #docref and #status
# @return [Hash] document attributes; language and script are fixed to
#   ["zh"] and ["Hans"]
def scrapped_data(doc, src, hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  docref = hit.docref
  {
    fetched: Date.today.to_s,
    committee: get_committee(doc, docref),
    docid: get_docid(docref),
    title: get_titles(doc),
    contributor: get_contributors(doc, docref),
    doctype: get_type,
    docstatus: get_status(doc, hit.status),
    gbtype: get_gbtype(doc, docref),
    ccs: get_ccs(doc),
    ics: get_ics(doc),
    link: [{ type: "src", content: src }],
    date: get_dates(doc),
    language: ["zh"],
    script: ["Hans"],
    structuredidentifier: fetch_structuredidentifier(docref),
  }
end