Module: Isobib::Scrapper

Defined in:
lib/isobib/scrapper.rb

Overview

Scrapper. rubocop:disable Metrics/ModuleLength

Constant Summary collapse

# Base URL for all scraped ISO pages.
DOMAIN =
'https://www.iso.org'
# Maps the stage/type prefix found in an ISO document reference
# (e.g. "ISO/TS 12345") to the bibliographic document type identifier.
TYPES =
{
  'TS'    => 'technicalSpecification',
  'TR'    => 'technicalReport',
  'PAS'   => 'publiclyAvailableSpecification',
  'AWI'   => 'approvedWorkItem',
  'CD'    => 'committeeDraft',
  'FDIS'  => 'finalDraftInternationalStandard',
  'NP'    => 'newProposal',
  'DIS'   => 'draftInternationalStandard',
  'WD'    => 'workingDraft',
  'R'     => 'recommendation',
  'Guide' => 'guide'
}.freeze

Class Method Summary collapse

Class Method Details

.get(text) ⇒ Array<Hash>

Parameters:

  • text (String)

Returns:

  • (Array<Hash>)


42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/isobib/scrapper.rb', line 42

# Run an Algolia search for +text+ and scrape each matching ISO page
# with a pool of worker threads.
#
# @param text [String] search query
# @return [Array<Hash>] scraped documents; [] when iso.org is unreachable
def get(text)
  iso_workers = WorkersPool.new 4
  iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
  algolia_workers = start_algolia_search(text, iso_workers)
  # Collect scraped docs first, then shut down the search workers and
  # wait for them to drain before returning.
  iso_docs = iso_workers.result
  algolia_workers.end
  algolia_workers.result
  iso_docs
rescue # StandardError: any network/parse failure degrades to an empty result
  # Fixed: message previously said "http://" although DOMAIN is https.
  warn "Could not connect to https://www.iso.org"
  []
end

.parse_page(hit_data) ⇒ Hash

Parse page. rubocop:disable Metrics/AbcSize, Metrics/MethodLength

Parameters:

  • hit_data (Hash)

Returns:

  • (Hash)


61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/isobib/scrapper.rb', line 61

# Scrape a single ISO standard page into a bibliographic item.
#
# @param hit_data [Hash] Algolia search hit; reads the 'path', 'title'
#   and 'status' keys
# @return [IsoBibItem::IsoBibliographicItem, nil] nil when 'path' does not
#   end in a numeric document id
def parse_page(hit_data)
  # Guard: only paths ending in a numeric id map to /standard/<id>.html.
  return unless hit_data['path'].match(/\d+$/)
  doc, url = get_page "/standard/#{hit_data['path'].match(/\d+$/)}.html"

  # Fetch edition.
  # NOTE(review): assumes the page renders "<strong>Edition</strong> N" with
  # the number in the parent's last child node — confirm against live markup.
  edition = doc&.xpath("//strong[contains(text(), 'Edition')]/..")
  &.children&.last&.text&.match(/\d+/)&.to_s

  titles, abstract = fetch_titles_abstract(doc)

  # Assemble the item from the parsed page plus fields carried in hit_data.
  IsoBibItem::IsoBibliographicItem.new(
    docid:        fetch_docid(doc),
    edition:      edition,
    language:     langs(doc).map { |l| l[:lang] },
    script:       langs(doc).map { |l| script(l[:lang]) }.uniq,
    titles:       titles,
    type:         fetch_type(hit_data['title']),
    docstatus:    fetch_status(doc, hit_data['status']),
    ics:          fetch_ics(doc),
    dates:        fetch_dates(doc),
    contributors: fetch_contributors(hit_data['title']),
    workgroup:    fetch_workgroup(doc),
    abstract:     abstract,
    copyright:    fetch_copyright(hit_data['title'], doc),
    link:       fetch_link(doc, url),
    relations:    fetch_relations(doc)
  )
end