Class: LinkScraper::Scrape
- Inherits:
- Object
- Inheritance chain: Object → LinkScraper::Scrape
- Defined in:
- lib/link_scraper/scrape.rb
Instance Method Summary
- #encode_link(link_hsh) ⇒ Object
- #encoder(text) ⇒ Object
- #evaluate_scrub_hsh(hsh) ⇒ Object
- #extract_link_from_url(valid_links, url) ⇒ Object
- #initialize(args = {}) ⇒ Scrape (constructor)
attr_accessor :text_criteria, :path_criteria.
- #scrub_link_hashes(link_hashes, url) ⇒ Object
- #start(url) ⇒ Object
Constructor Details
#initialize(args = {}) ⇒ Scrape
attr_accessor :text_criteria, :path_criteria
5 6 7 8 9 |
# File 'lib/link_scraper/scrape.rb', line 5
#
# Sets up the collaborators used by the scraper: one ScrubDb::Strings
# scrubber for link text, one for link paths, and the Mechanizer::Noko
# instance that #start asks to scrape a page.
#
# @param args [Hash] options; only :text_criteria and :path_criteria are
#   read, and each falls back to an empty hash when absent
def initialize(args = {})
  text_criteria = args.fetch(:text_criteria) { {} }
  path_criteria = args.fetch(:path_criteria) { {} }

  @text_scrub = ScrubDb::Strings.new(text_criteria)
  @path_scrub = ScrubDb::Strings.new(path_criteria)
  @noko = Mechanizer::Noko.new
end
Instance Method Details
#encode_link(link_hsh) ⇒ Object
57 58 59 60 61 |
# File 'lib/link_scraper/scrape.rb', line 57
#
# Runs #encoder over the :text and :path entries of a link hash.
#
# The hash is modified in place and the same object is returned.
#
# @param link_hsh [Hash] link hash with :text and :path keys
# @return [Hash] the same hash, with both values passed through #encoder
def encode_link(link_hsh)
  %i[text path].each do |key|
    link_hsh[key] = encoder(link_hsh[key])
  end
  link_hsh
end
#encoder(text) ⇒ Object
64 65 66 67 68 69 70 71 72 73 |
# File 'lib/link_scraper/scrape.rb', line 64
#
# Normalizes a piece of link text or path to plain, single-spaced ASCII.
#
# Blank input (as judged by `present?`) is returned untouched. Otherwise the
# text is cleaned in this order:
#   1. characters with an invalid encoding are dropped,
#   2. everything outside U+0000..U+007F is deleted,
#   3. double quotes are turned into spaces,
#   4. runs of whitespace are collapsed to one space and the ends trimmed.
#
# Quotes are replaced BEFORE whitespace is collapsed; doing it afterwards
# re-introduces double spaces (e.g. `a "b" c` would become `a  b  c`).
#
# @param text [String, nil] raw text
# @return [String, nil] cleaned text, or the original value when blank
def encoder(text)
  return text unless text.present?

  valid_chars = text.chars.select(&:valid_encoding?).join
  ascii_only = valid_chars.delete("^\u{0000}-\u{007F}")
  ascii_only.tr('"', ' ').gsub(/\s+/, ' ').strip
end
#evaluate_scrub_hsh(hsh) ⇒ Object
76 77 78 79 |
# File 'lib/link_scraper/scrape.rb', line 76
#
# Decides whether a scrub result counts as a match.
#
# A result matches when it has at least one positive criterion and no
# negative criteria.
#
# @param hsh [Hash] scrub result with :string, :pos_criteria and :neg_criteria
# @return [Object, nil] hsh[:string] on a match, otherwise nil
def evaluate_scrub_hsh(hsh)
  return nil unless hsh[:pos_criteria].any?
  return nil unless hsh[:neg_criteria].empty?

  hsh[:string]
end
#extract_link_from_url(valid_links, url) ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/link_scraper/scrape.rb', line 23
#
# Replaces each link's :path with just the path component of that URI
# (scheme, host and query string are dropped).
#
# The link hashes are modified in place. A link whose :path cannot be parsed
# keeps its original :path; the parse error message is printed and
# processing continues with the next link.
#
# @param valid_links [Array<Hash>] link hashes with a :path entry
# @param url [String] the scraped page URL (currently unused)
# @return [Array<Hash>] the same link hashes, in the same order
def extract_link_from_url(valid_links, url)
  valid_links.map do |link|
    begin
      # URI() either returns a URI object or raises, so no nil guard is needed.
      link[:path] = URI(link[:path]).path
    rescue StandardError => e
      # Best effort: report the failure and leave this link's path untouched.
      puts e.message
    end
    link
  end
end
#scrub_link_hashes(link_hashes, url) ⇒ Object
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# File 'lib/link_scraper/scrape.rb', line 38
#
# Filters link hashes down to those whose text or path passes scrubbing.
#
# Links with neither :text nor :path present are discarded. Every other link
# is first run through #encode_link, then its text and path are scrubbed by
# the text and path scrubbers; the link is kept when #evaluate_scrub_hsh
# yields a present value for either one.
#
# @param link_hashes [Array<Hash>] link hashes with :text and :path keys
# @param url [String] the scraped page URL (currently unused)
# @return [Array<Hash>] the encoded link hashes that passed
def scrub_link_hashes(link_hashes, url)
  link_hashes.each_with_object([]) do |link_hsh, kept|
    next unless link_hsh[:text].present? || link_hsh[:path].present?

    encoded = encode_link(link_hsh)
    text_result = @text_scrub.scrub_string(encoded[:text])
    path_result = @path_scrub.scrub_string(encoded[:path])
    text_match = evaluate_scrub_hsh(text_result)
    path_match = evaluate_scrub_hsh(path_result)

    kept << encoded if text_match.present? || path_match.present?
  end
end
#start(url) ⇒ Object
12 13 14 15 16 17 18 19 20 |
# File 'lib/link_scraper/scrape.rb', line 12
#
# Scrapes a page and returns the links that pass the configured criteria.
#
# The page is scraped through @noko, the :texts_and_paths entry of the
# result is filtered with #scrub_link_hashes, and the surviving links are
# passed through #extract_link_from_url.
#
# @param url [String] address of the page to scrape
# @return [Array<Hash>] the result of #extract_link_from_url
def start(url)
  noko_hash = @noko.scrape({ url: url })

  # NOTE(review): assumes :texts_and_paths can be nil when the scrape fails
  # (the result also carries :err_msg) -- confirm against Mechanizer::Noko.
  # Falling back to [] keeps a failed scrape from raising NoMethodError.
  link_hashes = noko_hash[:texts_and_paths] || []

  valid_links = scrub_link_hashes(link_hashes, url)
  extract_link_from_url(valid_links, url)
end