Module: RelatonGb::Scrapper

Included in: GbScrapper, SecScrapper, TScrapper

Defined in: lib/relaton_gb/scrapper.rb

Overview

Common scraping methods shared by GbScrapper, SecScrapper and TScrapper.

Constant Summary collapse

# Maps the status label found on a standard's page (or supplied by the
# search hit) to the stage passed to RelatonBib::DocumentStatus.
# Labels missing from this table produce no status at all.
STAGES =

{ "即将实施" => "published", # "about to take effect"
"现行" => "activated", # "currently in force"
"废止" => "obsoleted", # "abolished"
"被代替" => "replaced" }.freeze # "superseded"

Instance Method Summary collapse

Instance Method Details

#fetch_structuredidentifier(docref) ⇒ `RelatonIsoBib::StructuredIdentifier`

Parameters:

docref (String)

Returns:

(RelatonIsoBib::StructuredIdentifier)

# File 'lib/relaton_gb/scrapper.rb', line 49

# Builds the structured identifier for a Chinese standard reference.
#
# The reference is split with a regular expression: capture 1 (leading text
# containing no dot or dash and ending in digits) becomes the project
# number; capture 2 (digits directly following a dot, otherwise an empty
# string) becomes the part number.
#
# @param docref [String] document reference, e.g. "GB/T 20223.1-2006"
# @return [RelatonIsoBib::StructuredIdentifier]
# @raise [NoMethodError] when docref does not match the pattern
def fetch_structuredidentifier(docref)
  parts = docref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
  attrs = {
    project_number: parts[1],
    part_number: parts[2],
    prefix: nil,
    id: docref,
    type: "Chinese Standard",
  }
  RelatonIsoBib::StructuredIdentifier.new(**attrs)
end

#get_contributors(doc, docref) ⇒ `Array<Hash>`

Parameters:

doc (Nokogiri::HTML::Document)
docref (String)

Returns:

(Array<Hash>)

# File 'lib/relaton_gb/scrapper.rb', line 60

# Builds the publisher contributor entry for a standard.
#
# The agency prefix is the leading run of non-whitespace characters of
# docref. For prefixes that do not start with "GB", a trailing "/T" or "/Z"
# is dropped before the agency lookup. The agency name is then requested
# in English and Chinese through #org.
#
# @param doc [Nokogiri::HTML::Document]
# @param docref [String]
# @return [Array<Hash>] a single publisher entry, or an empty array when
#   no agency name is found for either language
def get_contributors(doc, docref)
  # Non-destructive extraction: nil.to_s is a frozen String on Ruby >= 2.7,
  # so an in-place sub! raised FrozenError whenever docref had no leading
  # token (for example an empty reference).
  name = docref[/^[^\s]+/].to_s
  name = name.sub(%r{/[TZ]$}, "") unless name.start_with?("GB")
  gbtype = get_gbtype(doc, docref)
  orgs = %w[en zh].map { |lang| org(lang, name, gbtype) }.compact
  return [] unless orgs.any?

  [{ entity: RelatonBib::Organization.new(name: orgs), role: [type: "publisher"] }]
end

#get_docid(docref) ⇒ `Array<RelatonBib::DocumentIdentifier>`

Parameters:

docref (String)

Returns:

(Array<RelatonBib::DocumentIdentifier>)



43
44
45

# File 'lib/relaton_gb/scrapper.rb', line 43

# Wraps the document reference in a single primary identifier.
#
# @param docref [String]
# @return [Array<RelatonBib::DocumentIdentifier>] one-element array holding
#   an identifier of type "Chinese Standard" flagged as primary
def get_docid(docref)
  identifier = RelatonBib::DocumentIdentifier.new(
    id: docref,
    type: "Chinese Standard",
    primary: true,
  )
  [identifier]
end

#get_status(doc, status = nil) ⇒ `RelatonBib::DocumentStatus`

Parameters:

doc (Nokogiri::HTML::Document)
status (String, NilClass) (defaults to: nil)

Returns:

(RelatonBib::DocumentStatus, nil) — nil when the status label has no entry in STAGES

# File 'lib/relaton_gb/scrapper.rb', line 101

# Resolves the document status.
#
# When no status is given, the label is read (and stripped) from the <span>
# inside the table cell containing "标准状态". The label is then looked up
# in STAGES.
#
# @param doc [Nokogiri::HTML::Document]
# @param status [String, nil] status label; read from doc when nil
# @return [RelatonBib::DocumentStatus, nil] nil when the label has no entry
#   in STAGES
def get_status(doc, status = nil)
  label = status || doc.at("//td[contains(., '标准状态')]/span")&.text&.strip
  stage = STAGES[label]
  RelatonBib::DocumentStatus.new(stage: stage) if stage
end

#get_titles(doc) ⇒ `Array<RelatonBib::TypedTitleString>`

Parameters:

doc (Nokogiri::HTML::Document)

Returns:

(Array<RelatonBib::TypedTitleString>)

# File 'lib/relaton_gb/scrapper.rb', line 85

# Extracts the Chinese title and, when present, the English title.
#
# The Chinese title is the text of the <b> inside the cell containing
# "中文标准名称". The English title is the first run of ASCII word and
# whitespace characters in the text of the cell containing "英文标准名称";
# it is omitted when that run is empty.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonBib::TypedTitleString>]
# @raise [NoMethodError] when either cell is missing from doc
def get_titles(doc)
  zh_text = doc.at("//td[contains(text(), '中文标准名称')]/b").text
  result = RelatonBib::TypedTitleString.from_string(zh_text, "zh", "Hans")
  en_text = doc.at("//td[contains(text(), '英文标准名称')]").text[/[\w\s]+/].to_s
  if en_text.empty?
    result
  else
    result + RelatonBib::TypedTitleString.from_string(en_text, "en", "Latn")
  end
end

#get_type ⇒ `Object`



94
95
96

# File 'lib/relaton_gb/scrapper.rb', line 94

# Document type used for every scraped item.
#
# @return [DocumentType] a document type whose type is "standard"
def get_type
  DocumentType.new(type: "standard")
end

#org(lang, name, gbtype) ⇒ `Hash`

Parameters:

lang (String)
name (String)
gbtype (Hash)

Returns:

(Hash, nil) — nil when no agency name is found

# File 'lib/relaton_gb/scrapper.rb', line 75

# Looks up the issuing agency name in one language.
#
# @param lang [String] language code handed to GbAgencies::Agencies
# @param name [String] agency prefix taken from the document reference
# @param gbtype [Hash] its :scope and :mandate entries are passed to the lookup
# @return [Hash, nil] { language:, content: } for the agency, or nil when
#   the lookup yields nothing
def org(lang, name, gbtype)
  agencies = GbAgencies::Agencies.new(lang, {}, "")
  agency_name = agencies.standard_agency1(gbtype[:scope], name, gbtype[:mandate])
  { language: lang, content: agency_name } if agency_name
end

#scrapped_data(doc, src, hit) ⇒ `Hash`

Parameters:

doc (Nokogiri::HTML::Document)
src (String)
hit (RelatonGb::Hit)

Returns:

(Hash)

# File 'lib/relaton_gb/scrapper.rb', line 21

# Collects every scraped attribute of a standard into one hash.
#
# @param doc [Nokogiri::HTML::Document] parsed page of the standard
# @param src [String] stored as the content of the single "src" link
# @param hit [RelatonGb::Hit] search hit; its docref and status are read
# @return [Hash] scraped attributes keyed by symbol
def scrapped_data(doc, src, hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  docref = hit.docref
  source_link = { type: "src", content: src }
  {
    fetched: Date.today.to_s,
    committee: get_committee(doc, docref),
    docid: get_docid(docref),
    title: get_titles(doc),
    contributor: get_contributors(doc, docref),
    doctype: get_type,
    docstatus: get_status(doc, hit.status),
    gbtype: get_gbtype(doc, docref),
    ccs: get_ccs(doc),
    ics: get_ics(doc),
    link: [source_link],
    date: get_dates(doc),
    language: ["zh"],
    script: ["Hans"],
    structuredidentifier: fetch_structuredidentifier(docref),
  }
end

Module: RelatonGb::Scrapper

Overview

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#fetch_structuredidentifier(docref) ⇒ RelatonIsoBib::StructuredIdentifier

#get_contributors(doc, docref) ⇒ Array<Hash>

#get_docid(docref) ⇒ Array<RelatonBib::DocumentIdentifier>

#get_status(doc, status = nil) ⇒ RelatonBib::DocumentStatus

#get_titles(doc) ⇒ Array<RelatonBib::TypedTitleString>

#get_type ⇒ Object

#org(lang, name, gbtype) ⇒ Hash

#scrapped_data(doc, src, hit) ⇒ Hash

#fetch_structuredidentifier(docref) ⇒ `RelatonIsoBib::StructuredIdentifier`

#get_contributors(doc, docref) ⇒ `Array<Hash>`

#get_docid(docref) ⇒ `Array<RelatonBib::DocumentIdentifier>`

#get_status(doc, status = nil) ⇒ `RelatonBib::DocumentStatus`

#get_titles(doc) ⇒ `Array<RelatonBib::TypedTitleString>`

#get_type ⇒ `Object`

#org(lang, name, gbtype) ⇒ `Hash`

#scrapped_data(doc, src, hit) ⇒ `Hash`