Class: LinkScraper::Scrape

Inherits:
Object
  • Object
  • LinkScraper::Scrape
Defined in:
lib/link_scraper/scrape.rb

Instance Method Summary collapse

Constructor Details

#initialize(args = {}) ⇒ Scrape

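Returns a new instance of Scrape. `args` may supply `:text_criteria` and `:path_criteria` (each defaults to `{}`); they are passed to the text and path scrubbers.

Source comment: attr_accessor :text_criteria, :path_criteria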



5
6
7
8
9
# File 'lib/link_scraper/scrape.rb', line 5

# Sets up the two string scrubbers and the page fetcher used by this scraper.
#
# @param args [Hash] options; +:text_criteria+ and +:path_criteria+ are each
#   passed to ScrubDb::Strings.new, falling back to an empty hash when the
#   key is absent.
def initialize(args = {})
  text_criteria = args.fetch(:text_criteria) { {} }
  @text_scrub = ScrubDb::Strings.new(text_criteria)

  path_criteria = args.fetch(:path_criteria) { {} }
  @path_scrub = ScrubDb::Strings.new(path_criteria)

  @noko = Mechanizer::Noko.new
end

Instance Method Details

#encode_link(link_hsh) ⇒ Object



57
58
59
60
61
# File 'lib/link_scraper/scrape.rb', line 57

# Runs the +:text+ and +:path+ values of a link hash through #encoder.
#
# The hash is updated in place (text first, then path) and the same object
# is returned.
#
# @param link_hsh [Hash] link data read and written under +:text+ and +:path+
# @return [Hash] the same +link_hsh+ with both entries replaced
def encode_link(link_hsh)
  %i[text path].each do |field|
    link_hsh[field] = encoder(link_hsh[field])
  end
  link_hsh
end

#encoder(text) ⇒ Object



64
65
66
67
68
69
70
71
72
73
# File 'lib/link_scraper/scrape.rb', line 64

# Reduces scraped text to a compact, single-spaced ASCII string.
#
# Input for which +present?+ is false (for example +nil+) is returned
# unchanged. Otherwise the text is cleaned in this order:
# 1. characters with an invalid encoding are dropped,
# 2. every character outside U+0000..U+007F is deleted,
# 3. double quotes are replaced by spaces,
# 4. runs of whitespace collapse to one space and the ends are trimmed.
#
# @param text [String, nil] raw text
# @return [String, nil] the cleaned string, or +text+ itself when it is not present
def encoder(text)
  return text unless text.present?

  ascii = text.chars.select(&:valid_encoding?).join.delete("^\u{0000}-\u{007F}")

  # Quotes are turned into spaces BEFORE whitespace is collapsed; doing it the
  # other way round leaves doubled spaces wherever a quote sat next to a space.
  ascii.tr('"', ' ').gsub(/\s+/, ' ').strip
end

#evaluate_scrub_hsh(hsh) ⇒ Object



76
77
78
79
# File 'lib/link_scraper/scrape.rb', line 76

# Decides whether a scrub result counts as a match.
#
# @param hsh [Hash] scrub result read under +:pos_criteria+, +:neg_criteria+
#   and +:string+
# @return [Object, nil] +hsh[:string]+ when +:pos_criteria+ has any truthy
#   entry and +:neg_criteria+ is empty; otherwise +nil+
def evaluate_scrub_hsh(hsh)
  # :neg_criteria is only consulted once a positive match exists.
  return unless hsh[:pos_criteria].any?
  return unless hsh[:neg_criteria].empty?

  hsh[:string]
end


#extract_link_from_url(valid_links, url) ⇒ Object

23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/link_scraper/scrape.rb', line 23

# Replaces each link's +:path+ with the path component of its URI.
#
# Links are modified in place. When +URI()+ raises for a link, the error
# message is printed with +puts+ and that link keeps its original +:path+.
#
# @param valid_links [Array<Hash>] links read and written under +:path+
# @param url [String] accepted but not used by this method
# @return [Array<Hash>] a new array holding the same link hashes, in order
def extract_link_from_url(valid_links, url)
  valid_links.each_with_object([]) do |entry, normalized|
    begin
      parsed = URI(entry[:path])
      entry[:path] = parsed&.path
    rescue StandardError => error
      puts error.message
    end

    normalized << entry
  end
end


#scrub_link_hashes(link_hashes, url) ⇒ Object

38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/link_scraper/scrape.rb', line 38

# Keeps only the links whose text or path passes its scrubber.
#
# A link with neither +:text+ nor +:path+ present is dropped. Every other
# link is cleaned with #encode_link, its text and path are run through
# +@text_scrub+ / +@path_scrub+, and each result is judged by
# #evaluate_scrub_hsh. The link is kept when either judgement is present.
#
# @param link_hashes [Array<Hash>, nil] links with +:text+ and +:path+;
#   +nil+ is treated as "no links"
# @param url [String] accepted but not used by this method
# @return [Array<Hash>] the surviving link hashes, in their original order
def scrub_link_hashes(link_hashes, url)
  # A scrape may come back without a link list; return an empty result
  # rather than raising NoMethodError on nil.
  return [] if link_hashes.nil?

  kept = link_hashes.map do |link_hsh|
    next unless link_hsh[:text].present? || link_hsh[:path].present?

    link_hsh = encode_link(link_hsh)
    text_hsh = @text_scrub.scrub_string(link_hsh[:text])
    path_hsh = @path_scrub.scrub_string(link_hsh[:path])

    text = evaluate_scrub_hsh(text_hsh)
    path = evaluate_scrub_hsh(path_hsh)

    link_hsh if text.present? || path.present?
  end

  kept.compact
end

#start(url) ⇒ Object



12
13
14
15
16
17
18
19
20
# File 'lib/link_scraper/scrape.rb', line 12

# Scrapes +url+ and returns its links after scrubbing and path extraction.
#
# Only the +:texts_and_paths+ entry of the scrape result is used. It is
# passed to #scrub_link_hashes, and that result to #extract_link_from_url.
#
# @param url [String] address handed to +@noko.scrape+ as +{ url: url }+
# @return [Object] whatever #extract_link_from_url returns for the scrubbed links
def start(url)
  noko_hash = @noko.scrape({ url: url })
  link_hashes = noko_hash[:texts_and_paths]

  extract_link_from_url(scrub_link_hashes(link_hashes, url), url)
end