Class: Mechanizer::Noko
- Inherits:
- Object
- Inheritance chain: Object → Mechanizer::Noko
- Defined in:
- lib/mechanizer/noko.rb
Instance Method Summary
- #error_parser(err_msg) ⇒ Object
- #extract_links(noko_hash) ⇒ Object
- #initialize ⇒ Noko (constructor): returns a new instance of Noko.
- #pre_noko_msg(url) ⇒ Object
- #scrape(args) ⇒ Object
- #start_noko(noko_hash) ⇒ Object
Constructor Details
#initialize ⇒ Noko
Returns a new instance of Noko.
14 15 16 |
# File 'lib/mechanizer/noko.rb', line 14
# Creates a Noko whose maximum wait starts at 60 (reported to the user
# as seconds by #pre_noko_msg).
#
# @return [Noko] a new instance of Noko
def initialize
  # Starting value only: #scrape assigns @timeout again on every call.
  @timeout = 60
end
Instance Method Details
#error_parser(err_msg) ⇒ Object
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'lib/mechanizer/noko.rb', line 79
# Collapses a raw error message into one of a fixed set of short labels.
#
# The branches are tried top to bottom and the first substring found in the
# message decides the label, so their order is significant (for example a
# message containing both "connection refused" and "TCP connection" is
# labelled "Error: Connection").
#
# NOTE(review): matching is case-sensitive; confirm that the lowercase
# "connection refused" spelling matches the messages actually raised.
#
# @param err_msg [String, nil] raw error text; nil and non-String values are
#   coerced with #to_s instead of raising NoMethodError
# @return [String] the matching label, or "Error: Undefined" when nothing matches
def error_parser(err_msg)
  case err_msg.to_s
  when /404 => Net::HTTPNotFound/ then 'Error: 404'
  when /connection refused/ then 'Error: Connection'
  when /undefined method/ then 'Error: Method'
  when /TCP connection/ then 'Error: TCP'
  when /execution expired/ then 'Error: Runtime'
  when /absolute URL needed/ then 'Error: URL Not Absolute'
  else 'Error: Undefined'
  end
end
#extract_links(noko_hash) ⇒ Object
27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/mechanizer/noko.rb', line 27
# Fills noko_hash[:texts_and_paths] with one {text:, path:} hash per link
# on the fetched page.
#
# The hash is returned untouched when it already carries a non-blank
# :err_msg, when there is no :page, or when the page reports no links.
# Emptiness is checked with plain Ruby (nil / empty / whitespace-only)
# so the method does not need ActiveSupport's #present?.
#
# @param noko_hash [Hash] reads :page and :err_msg; writes :texts_and_paths
# @return [Hash] the same noko_hash object that was passed in
def extract_links(noko_hash)
  links = noko_hash[:page]&.links
  failed = !noko_hash[:err_msg].to_s.strip.empty?
  return noko_hash if failed || links.nil? || links.empty?

  noko_hash[:texts_and_paths] = links.map do |link|
    {
      # Lower-case, collapse every whitespace run to one space, then trim.
      text: link.text&.downcase&.gsub(/\s+/, ' ')&.strip,
      # href may be nil for a link without a target; nil is kept as-is.
      path: link.href&.downcase&.strip
    }
  end
  noko_hash
end
#pre_noko_msg(url) ⇒ Object
73 74 75 76 77 |
# File 'lib/mechanizer/noko.rb', line 73
# Prints a banner announcing the URL about to be scraped together with the
# current @timeout, and hands the same banner text back to the caller.
#
# @param url [Object] interpolated into the banner as-is
# @return [String] the banner that was written with puts
def pre_noko_msg(url)
  rule = '=' * 40
  "\n\n#{rule}\nSCRAPING: #{url}\nMax Wait Set: #{@timeout} Seconds\n\n".tap do |banner|
    puts banner
  end
end
#scrape(args) ⇒ Object
18 19 20 21 22 23 24 25 |
# File 'lib/mechanizer/noko.rb', line 18
# Entry point: fetches args[:url] via #start_noko and then collects link
# texts and paths via #extract_links.
#
# NOTE(review): :texts_and_paths starts out as an empty Hash, while
# #extract_links stores an Array there on success — confirm callers cope
# with both shapes.
#
# @param args [Hash] :url is required; :timeout is optional and defaults to 60
# @return [Hash] result with :url, :err_msg, :texts_and_paths and :page keys
# @raise [KeyError] when args has no :url key
def scrape(args)
  @timeout = args.fetch(:timeout, 60)
  seed = { url: args.fetch(:url), err_msg: nil, texts_and_paths: {}, page: nil }
  extract_links(start_noko(seed))
end
#start_noko(noko_hash) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
# File 'lib/mechanizer/noko.rb', line 39
# Fetches noko_hash[:url] with a freshly configured Mechanize agent and
# records the outcome on the hash.
#
# On success the fetched page is stored under :page. Otherwise :err_msg is
# set to "Error: Not-Noko-Obj" (the result does not respond to at_css),
# "Error: Timeout" (a Timeout::Error was raised), or the label produced by
# #error_parser for any other StandardError.
#
# @param noko_hash [Hash] reads :url; writes :page or :err_msg
# @return [Hash] the same noko_hash object that was passed in
def start_noko(noko_hash)
  url = noko_hash[:url]

  begin
    pre_noko_msg(url)

    # Outer guard around the whole fetch, on top of the agent's own timeouts.
    Timeout.timeout(@timeout) do
      agent = Mechanize.new
      agent.user_agent_alias = 'Mac Safari'
      # NOTE(review): the attribute name on this line was lost when the
      # listing was extracted ("agent. = true"). follow_meta_refresh is
      # assumed from the commented-out block that followed Mechanize.new —
      # confirm against lib/mechanizer/noko.rb.
      agent.follow_meta_refresh = true
      agent.read_timeout = @timeout
      agent.open_timeout = @timeout # seconds to wait for a connection to open
      agent.idle_timeout = @timeout # reset connections unused for this many seconds
      agent.keep_alive = false

      page =
        begin
          agent.get(URI(url))
        rescue Mechanize::ResponseReadError => read_err
          # Fall back to whatever the error object can parse from the response.
          read_err.force_parse
        end

      if page.respond_to?(:at_css)
        noko_hash[:page] = page
      else
        noko_hash[:err_msg] = 'Error: Not-Noko-Obj'
      end
    end
  rescue Timeout::Error
    # Listed before StandardError so timeouts keep their own label.
    noko_hash[:err_msg] = 'Error: Timeout'
  rescue StandardError => e
    noko_hash[:err_msg] = error_parser("Error: #{e.message}")
  end

  noko_hash
end