Class: Fetcher::Worker
- Inherits:
-
Object
- Object
- Fetcher::Worker
- Includes:
- LogUtils::Logging
- Defined in:
- lib/fetcher/worker.rb
Instance Method Summary collapse
- #cache ⇒ Object
-
#clear_cache ⇒ Object
note: use cache[ uri ] = hash, where the hash holds headers plus body plus status code (410, etc.); read the entry back via cache[ uri ].
- #copy(src, dest, opts = {}) ⇒ Object
- #get(src) ⇒ Object
-
#get_response(src) ⇒ Object
todo: add file protocol.
-
#initialize(old_logger_do_not_use = nil) ⇒ Worker
constructor
todo/fix: remove logger from c’tor; use logutils instead.
- #read(src) ⇒ Object
-
#use_cache=(true_or_false) ⇒ Object
true|false.
- #use_cache? ⇒ Boolean
Constructor Details
#initialize(old_logger_do_not_use = nil) ⇒ Worker
todo/fix: remove logger from c’tor;
use logutils instead
34 35 36 37 38 39 40 41 42 |
# File 'lib/fetcher/worker.rb', line 34
#
# Sets up a worker with an empty conditional-GET cache and caching turned off.
#
# old_logger_do_not_use - deprecated and otherwise ignored; passing anything
#                         other than nil only prints a deprecation notice to
#                         stdout.
def initialize( old_logger_do_not_use=nil )
  unless old_logger_do_not_use.nil?
    puts "*** deprecated API call [Fetcher.initialize] - do NOT pass in logger; no longer required/needed; logger arg will get removed"
  end

  ### cache for conditional get (e.g. etags and last-modified headers/checks)
  @cache     = {}
  @use_cache = false
end
Instance Method Details
#cache ⇒ Object
47 |
# File 'lib/fetcher/worker.rb', line 47
#
# Returns the object currently held in @cache (the same instance, not a copy).
def cache
  @cache
end
#clear_cache ⇒ Object
note: use cache[ uri ] = hash, where the hash holds headers plus body plus status code (410, etc.);
read the entry back via cache[ uri ]
46 |
# File 'lib/fetcher/worker.rb', line 46
#
# Discards every cache entry by pointing @cache at a brand-new empty hash.
# The previous hash object is left untouched.
#
# Returns the new empty hash.
def clear_cache
  @cache = {}
end
#copy(src, dest, opts = {}) ⇒ Object
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/fetcher/worker.rb', line 73
#
# Fetches src through get_response and writes the response body to dest.
#
# src  - address handed unchanged to get_response
# dest - path of the file to write
# opts - optional settings; opts[:mode], when set, overrides the File.open
#        mode that would otherwise be picked from the content type
#
# Returns whatever the File.open block returns (the result of f.write).
# Raises HttpError when the response code is not '200'; nothing is written
# in that case.
def copy( src, dest, opts={} )
  ### fix: return true - success or
  #               false - error!!!
  ## todo: add file protocol - why? why not??

  logger.debug "fetch - copy src: #{src} to dest: #{dest}"

  response = get_response( src )

  # NOTE: on error (NOK) raise exception; do NOT copy file; sorry
  if response.code != '200'
    raise HttpError.new( response.code, response.message )
  end

  ### check:
  ## why not always use wb???
  ## how is it differet for text files?
  ## will convert newlines (from windows to unix) ???

  # check for content type; use 'wb' for images
  ## use application/zip or something - why? why not??
  if response.content_type.to_s =~ /image|zip/
    logger.debug ' switching to binary'
    mode = 'wb'
  else
    mode = 'w'
  end

  mode = opts[:mode] if opts[:mode]  # if mode flags passed in - take precedence

  File.open( dest, mode ) do |f|
    f.write( response.body )
  end
end
#get(src) ⇒ Object
52 53 54 55 56 57 |
# File 'lib/fetcher/worker.rb', line 52
#
# Logs the request at debug level, then hands src to get_response.
#
# Returns whatever get_response returns: HTTPResponse (code,message,body,etc.)
def get( src )
  logger.debug "fetch - get(_response) src: #{src}"

  get_response( src )
end
#get_response(src) ⇒ Object
todo: add file protocol
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 |
# File 'lib/fetcher/worker.rb', line 112
#
# todo: add file protocol
#
# Performs an HTTP GET for src, following 301/302/303/307 redirects.
#
# - A proxy is taken from ENV['HTTP_PROXY'] (or ENV['http_proxy']) when set.
# - At most 4 requests are sent; if yet another redirect would be needed an
#   ArgumentError ('HTTP redirect too deep') is raised.
# - When use_cache? is true and cache holds an entry for the full uri string,
#   its 'etag' / 'last-modified' values are sent as If-None-Match /
#   If-Modified-Since headers (conditional GET).
#
# src - absolute uri as a string
#
# Returns the last response received: a 200, a 304, or any other
# non-redirect response (in that last case an error line is printed to
# stdout first).
def get_response( src )
  uri = URI.parse( src )

  # new code: honor proxy env variable HTTP_PROXY
  #   try possible lower/case env variable (for *nix systems) is this necessary??
  proxy = ENV['HTTP_PROXY'] || ENV['http_proxy']

  if proxy
    proxy = URI.parse( proxy )
    logger.debug "using net http proxy: proxy.host=#{proxy.host}, proxy.port=#{proxy.port}"
    if proxy.user && proxy.password
      logger.debug " using credentials: proxy.user=#{proxy.user}, proxy.password=****"
    else
      logger.debug " using no credentials"
    end
    http_proxy = Net::HTTP::Proxy( proxy.host, proxy.port, proxy.user, proxy.password )
  else
    logger.debug "using direct net http access; no proxy configured"
    # all-nil arguments: no proxy host/port/credentials are supplied
    http_proxy = Net::HTTP::Proxy( nil, nil, nil, nil )
  end

  redirect_limit = 4
  response       = nil

  loop do
    raise ArgumentError, 'HTTP redirect too deep' if redirect_limit == 0
    redirect_limit -= 1

    http = http_proxy.new( uri.host, uri.port )

    logger.debug "GET #{uri.request_uri} uri=#{uri}, redirect_limit=#{redirect_limit}"

    headers = { 'User-Agent' => "fetcher gem v#{VERSION}" }

    if use_cache?
      ## check for existing cache entry in cache store (lookup by uri)
      ## todo/fix: normalize uri!!!! - how?
      ##  - remove query_string ?? fragement ?? why? why not??

      ## note: using uri.to_s should return full uri e.g. http://example.com/page.html
      cache_entry = cache[ uri.to_s ]
      if cache_entry
        logger.info "found cache entry for >#{uri.to_s}<"
        if cache_entry['etag']
          logger.info "adding header If-None-Match (etag) >#{cache_entry['etag']}< for conditional GET"
          headers['If-None-Match'] = cache_entry['etag']
        end
        if cache_entry['last-modified']
          logger.info "adding header If-Modified-Since (last-modified) >#{cache_entry['last-modified']}< for conditional GET"
          headers['If-Modified-Since'] = cache_entry['last-modified']
        end
      end
    end

    request = Net::HTTP::Get.new( uri.request_uri, headers )
    if uri.instance_of? URI::HTTPS
      http.use_ssl = true
      # NOTE(review): certificate verification is switched off here, so the
      #   server's identity is not checked - confirm this is really intended
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end

    response = http.request( request )

    case response.code
    when '200'
      logger.debug "#{response.code} #{response.message}"
      logger.debug " content_type: #{response.content_type}, content_length: #{response.content_length}"
      break  # will return response
    when '304'
      # -- Not Modified - for conditional GETs (using etag,last-modified)
      logger.debug "#{response.code} #{response.message}"
      break  # will return response
    when '301', '302', '303', '307'
      # 301 = moved permanently
      # 302 = found
      # 303 = see other
      # 307 = temporary redirect
      location = response['location']
      logger.debug "#{response.code} #{response.message} location=#{location}"
      newuri = URI.parse( location )
      if newuri.relative?
        logger.debug "url relative; try to make it absolute"
        newuri = uri + location
      end
      uri = newuri   # next pass of the loop requests the redirect target
    else
      puts "*** error - fetch HTTP - #{response.code} #{response.message}"
      break  # will return response
    end
  end

  response
end
#read(src) ⇒ Object
60 61 62 63 64 65 66 67 68 69 70 |
# File 'lib/fetcher/worker.rb', line 60
#
# Fetches src through get_response and returns the body as a string.
#
# Returns a copy (dup) of the response body when the response code is '200';
# returns an empty string for every other code.
def read( src )
  logger.debug "fetch - copy src: #{src} into string"

  response = get_response( src )

  # on error return empty string; - check: better return nil- why? why not??
  return '' if response.code != '200'

  response.body.dup  # return string copy
end
#use_cache=(true_or_false) ⇒ Object
true|false
48 |
# File 'lib/fetcher/worker.rb', line 48
#
# Stores the given value in @use_cache (the flag reported by use_cache?).
#
# true_or_false - true|false
def use_cache=(true_or_false)
  @use_cache = true_or_false
end
#use_cache? ⇒ Boolean
49 |
# File 'lib/fetcher/worker.rb', line 49
#
# Returns the current value of @use_cache, unchanged.
def use_cache?
  @use_cache
end