Class: Mechanizer::Noko

Inherits:
Object
  • Object
show all
Defined in:
lib/mechanizer/noko.rb

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Noko

Returns a new instance of Noko.



14
15
16
# File 'lib/mechanizer/noko.rb', line 14

# Builds a Noko instance with its wait limit (@timeout) preset to 60.
# pre_noko_msg reports this value in seconds, and start_noko feeds it to
# Timeout and to the Mechanize agent's timeout settings.
def initialize
  @timeout = 60 # starting value only; scrape overwrites it on every call
end

Instance Method Details

#error_parser(err_msg) ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/mechanizer/noko.rb', line 79

# Collapses a raw exception message into one of a fixed set of short labels.
#
# Fragments are tested in the order listed below and the first one contained
# in err_msg wins; matching is case-sensitive. Anything that matches none of
# them is reported as "Error: Undefined".
#
# @param err_msg [String] raw message to classify
# @return [String] the short "Error: ..." label
def error_parser(err_msg)
  # Ordered [fragment, label] pairs; order matters when several fragments match.
  known_errors = [
    ["404 => Net::HTTPNotFound", "Error: 404"],
    ["connection refused", "Error: Connection"],
    ["undefined method", "Error: Method"],
    ["TCP connection", "Error: TCP"],
    ["execution expired", "Error: Runtime"],
    ["absolute URL needed", "Error: URL Not Absolute"]
  ]

  _fragment, label = known_errors.find { |fragment, _| err_msg.include?(fragment) }
  label || "Error: Undefined"
end


#extract_links(noko_hash) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
# File 'lib/mechanizer/noko.rb', line 27

# Fills noko_hash[:texts_and_paths] with one {text:, path:} hash per link on
# the fetched page.
#
# Nothing is written when noko_hash[:err_msg] is present or when the page has
# no links (or there is no page at all); the hash is returned as-is in those
# cases. Link text is downcased, has whitespace runs squeezed to one space,
# and is stripped; the href is downcased and stripped. Either value stays nil
# when the link does not provide it.
#
# @param noko_hash [Hash] reads :page and :err_msg, writes :texts_and_paths
# @return [Hash] the same noko_hash
def extract_links(noko_hash)
  page_links = noko_hash[:page]&.links
  return noko_hash if noko_hash[:err_msg].present?
  return noko_hash unless page_links.present?

  noko_hash[:texts_and_paths] = page_links.map do |anchor|
    label = anchor.text&.downcase&.gsub(/\s+/, ' ')&.strip
    target = anchor.href&.downcase&.strip
    { text: label, path: target }
  end
  noko_hash
end

#pre_noko_msg(url) ⇒ Object



73
74
75
76
77
# File 'lib/mechanizer/noko.rb', line 73

# Prints a banner announcing the URL about to be scraped together with the
# current wait limit, and hands the same banner text back to the caller.
#
# @param url [Object] target being scraped; rendered with to_s
# @return [String] the banner that was printed
def pre_noko_msg(url)
  divider = '=' * 40
  format("\n\n%s\nSCRAPING: %s\nMax Wait Set: %s Seconds\n\n", divider, url, @timeout)
    .tap { |banner| puts banner }
end

#scrape(args) ⇒ Object



18
19
20
21
22
23
24
25
# File 'lib/mechanizer/noko.rb', line 18

# Entry point: fetches args[:url] and collects the links found on it.
#
# Stores the wait limit on the instance first (args[:timeout], or 60 when
# absent), then builds a fresh result hash and passes it through start_noko
# and extract_links in that order.
#
# @param args [Hash] requires :url; :timeout is optional
# @return [Hash] result with keys :url, :err_msg, :texts_and_paths, :page
# @raise [KeyError] when args has no :url
def scrape(args)
  @timeout = args.fetch(:timeout, 60)
  blank_result = { url: args.fetch(:url), err_msg: nil, texts_and_paths: {}, page: nil }
  extract_links(start_noko(blank_result))
end

#start_noko(noko_hash) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/mechanizer/noko.rb', line 39

# Fetches noko_hash[:url] with a Mechanize agent and records the outcome on
# noko_hash.
#
# Outcomes written to the hash:
# * :page    - the fetched object, when it responds to at_css
# * :err_msg - "Error: Not-Noko-Obj" when the fetched object lacks at_css,
#              'Error: Timeout' on Timeout::Error, or the label produced by
#              error_parser for any other StandardError
#
# No StandardError raised by the fetch escapes; failures are reported through
# :err_msg only.
#
# @param noko_hash [Hash] must carry the target under :url
# @return [Hash] the same noko_hash
def start_noko(noko_hash)
  target = noko_hash[:url]

  begin
    pre_noko_msg(target)

    # Overall cap on the fetch, on top of the agent's own timeout settings.
    Timeout.timeout(@timeout) do
      browser = Mechanize.new
      browser.user_agent_alias = 'Mac Safari'
      browser.follow_meta_refresh = true
      browser.read_timeout = @timeout
      browser.open_timeout = @timeout # seconds to wait for a connection to open
      browser.idle_timeout = @timeout # reset connections unused for this many seconds
      browser.keep_alive = false # keep-alive connections are switched off

      fetched =
        begin
          browser.get(URI(target))
        rescue Mechanize::ResponseReadError => read_error
          # The body could not be read completely; use whatever can be parsed.
          read_error.force_parse
        end

      if fetched.respond_to?(:at_css)
        noko_hash[:page] = fetched
      else
        noko_hash[:err_msg] = "Error: Not-Noko-Obj"
      end
    end
  rescue Timeout::Error
    # Must precede the StandardError clause, which would otherwise claim it.
    noko_hash[:err_msg] = 'Error: Timeout'
  rescue StandardError => e
    noko_hash[:err_msg] = error_parser("Error: #{e.message}")
  end

  noko_hash
end