Class: Browser
- Defined in:
- lib/epitools/browser.rb,
lib/epitools/browser/cache.rb
Overview
A Mechanize-based class that emulates a web browser, with built-in page caching. Progress bars are enabled by default.
Defined Under Namespace
Classes: Cache
Instance Attribute Summary collapse
-
#agent ⇒ Object
Returns the value of attribute agent.
-
#cache ⇒ Object
Returns the value of attribute cache.
-
#delay(override_delay = nil, override_jitter = nil) ⇒ Object
Returns the value of attribute delay.
-
#delay_jitter ⇒ Object
Returns the value of attribute delay_jitter.
-
#use_cache ⇒ Object
Returns the value of attribute use_cache.
Instance Method Summary collapse
- #cache_put(page, url) ⇒ Object
- #cacheable?(page) ⇒ Boolean
-
#get(url, **options) ⇒ Object
Retrieve a URL and return a Mechanize::Page instance (which acts a bit like a Nokogiri::HTML::Document instance).
- #init_agent! ⇒ Object
- #init_cache! ⇒ Object
-
#initialize(**options) ⇒ Browser
constructor
Default options: :delay => 1 (sleep 1 second between gets); :delay_jitter => 0.2 (random deviation from delay); :use_cache => false (cache gets when true; :cache and :cached are read as well); :logs => false (don't log the detailed transfer info); :cookiefile => "cookies.txt" (save cookies to this file).
- #load_cookies! ⇒ Object
- #relative?(url) ⇒ Boolean
- #save_cookies! ⇒ Object
Constructor Details
#initialize(**options) ⇒ Browser
Default options:
:delay => 1, # Sleep 1 second between gets
:delay_jitter => 0.2, # Random deviation from delay
:use_cache => false, # Cache gets when true (:cache and :cached are read as well)
:logs => false, # Don't log the detailed transfer info
:cookiefile => "cookies.txt" # Save cookies to this file
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/epitools/browser.rb', line 39
#
# Store the browser settings in instance variables, optionally configure a
# SOCKS proxy, then call init_agent! and init_cache!.
#
# Recognised options (all optional):
#   :delay        - stored in @delay (default: 1)
#   :delay_jitter - stored in @delay_jitter (default: 0.2)
#   :cache, :cached, :use_cache
#                 - caching is enabled when any of these is truthy
#                   (default: disabled)
#   :logs, :use_logs
#                 - stored in @use_logs (default: false)
#   :cookiefile, :cookie_file
#                 - stored in @cookie_file (default: "cookies.txt")
#   :cache_file   - stored in @cache_file (default: "browser-cache.db")
#   :proxy        - "host:port" string, split on ':' and assigned to
#                   TCPSocket::socks_server / TCPSocket::socks_port
def initialize(**options)
  @last_get     = Time.at(0)

  @delay        = options[:delay]        || 1
  @delay_jitter = options[:delay_jitter] || 0.2
  @use_cache    = !!(options[:cache] || options[:cached] || options[:use_cache])
  # Both the short keys and the documented :use_logs / :cookie_file keys work.
  @use_logs     = options[:logs] || options[:use_logs] || false
  @cookie_file  = options[:cookiefile] || options[:cookie_file] || "cookies.txt"
  @cache_file   = options[:cache_file] || "browser-cache.db"

  # TODO: @progress, @user_agent, @logfile, @cache_file (default location: ~/.epitools?)

  if options[:proxy]
    host, port = options[:proxy].split(':')
    TCPSocket::socks_server = host
    TCPSocket::socks_port   = port.to_i
  end

  init_agent!
  init_cache!
end
Instance Attribute Details
#agent ⇒ Object
Returns the value of attribute agent.
29 30 31 |
# File 'lib/epitools/browser.rb', line 29
#
# Reader for @agent.
def agent
  @agent
end
#cache ⇒ Object
Returns the value of attribute cache.
29 30 31 |
# File 'lib/epitools/browser.rb', line 29
#
# Reader for @cache.
def cache
  @cache
end
#delay(override_delay = nil, override_jitter = nil) ⇒ Object
Returns the value of attribute delay.
29 30 31 |
# File 'lib/epitools/browser.rb', line 29
#
# Reader for @delay.
#
# NOTE(review): the heading for this entry lists the signature
# delay(override_delay = nil, override_jitter = nil), and `get` calls
# `delay(5)`, so a separate definition that accepts overrides presumably
# exists in the real file. It is not shown here -- confirm against
# lib/epitools/browser.rb.
def delay
  @delay
end
#delay_jitter ⇒ Object
Returns the value of attribute delay_jitter.
29 30 31 |
# File 'lib/epitools/browser.rb', line 29
#
# Reader for @delay_jitter.
def delay_jitter
  @delay_jitter
end
#use_cache ⇒ Object
Returns the value of attribute use_cache.
29 30 31 |
# File 'lib/epitools/browser.rb', line 29
#
# Reader for @use_cache.
def use_cache
  @use_cache
end
Instance Method Details
#cache_put(page, url) ⇒ Object
112 113 114 115 116 117 118 119 |
# File 'lib/epitools/browser.rb', line 112
#
# Store `page` in the cache under `url`, overwriting any existing entry.
# Nothing is written unless cache.valid_page?(page) is truthy and the page's
# content type starts with "text/" or mentions "javascript".
#
# Returns whatever cache.put returns, or nil when nothing was written.
def cache_put(page, url)
  return unless cache.valid_page?(page)
  # "javascript" anywhere in the type already covers "application/javascript".
  return unless page.content_type =~ %r{^text/|javascript}

  puts " |_ writing to cache"
  cache.put(page, url, :overwrite=>true)
end
#cacheable?(page) ⇒ Boolean
105 106 107 108 109 110 |
# File 'lib/epitools/browser.rb', line 105
#
# True when the page's content type starts with "text" or "application".
# Always returns true or false (false for a nil content type) rather than
# falling through to nil.
def cacheable?(page)
  %r{^(text|application)}.match?(page.content_type.to_s)
end
#get(url, **options) ⇒ Object
Retrieve a URL and return a Mechanize::Page instance (which acts a bit like a Nokogiri::HTML::Document instance).
Options:
:cached => true/false | check cache before getting page
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
# File 'lib/epitools/browser.rb', line 129
#
# Fetch `url` and return the resulting page, serving it from the cache when
# caching is in effect and the cache already holds it.
#
# Options:
#   :cached  - true/false; overrides @use_cache for this call when non-nil
#   :referer - passed through as the third argument of agent.get
#
# The listed network errors are retried after `delay(5)`. Once the retry
# count reaches max_retries the method returns nil. A SocketError whose
# message is "getaddrinfo: Name or service not known" is re-raised at once.
def get(url, **options)
  # TODO: Have a base-URL option

  #if relative?(url)
  #  url = URI.join("http://base-url/", url).to_s
  #end

  # Determine the cache setting
  use_cache      = options[:cached].nil? ? @use_cache : options[:cached]
  cached_already = cache.include?(url) if use_cache

  puts
  puts "[ GET #{url} (using cache: #{!!use_cache}) ]"

  # No need to throttle when the page is not going over the network.
  delay unless cached_already

  max_retries = 4
  retries     = 0

  begin
    if use_cache && (page = cache.get(url))
      puts " |_ cached (#{page.content_type})"
    else
      page = agent.get(url, [], options[:referer])
      @last_get = Time.now
      cache_put(page, url) if use_cache
    end

    puts
  rescue Net::HTTPBadResponse, Errno::ECONNRESET, SocketError, Timeout::Error, SOCKSError => e
    # Retrying cannot fix an unresolvable host name.
    raise if e.message == "getaddrinfo: Name or service not known"

    retries += 1
    return if retries >= max_retries

    puts " |_ ERROR: #{e.inspect} -- retrying"
    delay(5)
    retry

  # Disabled handler (was wrapped in =begin/=end in the original listing):
  #
  # rescue Mechanize::ResponseCodeError => e
  #
  #   case e.response_code
  #     when "401" #=> Net::HTTPUnauthorized
  #       p e
  #       login!
  #       page = get(url)
  #       puts
  #     when "404"
  #       p e
  #       raise e
  #     when "503"
  #       puts " |_ ERROR: #{e.inspect} -- retrying"
  #       delay(5)
  #       retry
  #   else
  #     raise e
  #   end
  end

  page
end
#init_agent! ⇒ Object
60 61 62 63 64 65 66 67 68 69 |
# File 'lib/epitools/browser.rb', line 60
#
# Build the Mechanize agent and store it in @agent: history capped at 10
# pages, the "Windows Chrome" user-agent alias, and a logger writing to
# "mechanize.log" when @use_logs is set.
def init_agent!
  @agent = Mechanize.new do |a|
    # ["Mechanize", "Mac Mozilla", "Linux Mozilla", "Windows IE 6", "iPhone", "Linux Konqueror", "Windows IE 7", "Mac FireFox", "Mac Safari", "Windows Mozilla"]
    a.max_history      = 10
    a.user_agent_alias = "Windows Chrome"
    a.log              = Logger.new "mechanize.log" if @use_logs
  end
end
#init_cache! ⇒ Object
82 83 84 85 |
# File 'lib/epitools/browser.rb', line 82
#
# Create the page cache in @cache from @cache_file and the agent, but only
# when @use_cache is set; otherwise @cache is left untouched.
def init_cache!
  # TODO: Rescue "couldn't load" exception and disable caching
  @cache = Cache.new(@cache_file, agent) if @use_cache
end
#load_cookies! ⇒ Object
87 88 89 90 91 92 93 94 |
# File 'lib/epitools/browser.rb', line 87
#
# Load cookies from @cookie_file into the agent's cookie jar.
#
# Returns true when the file exists and was handed to the jar, false when
# the file does not exist (the jar is not touched in that case).
def load_cookies!
  return false unless File.exist?(@cookie_file)

  agent.cookie_jar.load(@cookie_file)
  true
end
#relative?(url) ⇒ Boolean
101 102 103 |
# File 'lib/epitools/browser.rb', line 101
#
# True unless `url` begins with an http:// or https:// scheme.
# The scheme is matched case-insensitively and only at the very start of
# the string, so "HTTP://host/" counts as absolute.
def relative?(url)
  url !~ %r{\Ahttps?://}i
end
#save_cookies! ⇒ Object
96 97 98 99 |
# File 'lib/epitools/browser.rb', line 96
#
# Write the agent's cookie jar to @cookie_file. Always returns true.
def save_cookies!
  agent.cookie_jar.save_as(@cookie_file)
  true
end