Class: Retriever::Fetch
- Inherits:
-
Object
- Object
- Retriever::Fetch
- Defined in:
- lib/retriever/fetch.rb
Direct Known Subclasses
Constant Summary collapse
- HR =
'###############################'
Instance Attribute Summary collapse
-
#max_pages ⇒ Object
readonly
Returns the value of attribute max_pages.
-
#result ⇒ Object
readonly
Returns the value of attribute result.
-
#t ⇒ Object
readonly
Returns the value of attribute t.
Instance Method Summary collapse
-
#dump ⇒ Object
prints current data collection to STDOUT.
- #errlog(msg) ⇒ Object
-
#good_response?(resp, url) ⇒ Boolean
returns true if resp is ok to continue.
-
#initialize(url, options) ⇒ Fetch
constructor
given target URL and RR options, creates a fetch object.
- #lg(msg) ⇒ Object
- #start ⇒ Object
-
#write ⇒ Object
writes current data collection out to CSV in current directory.
Constructor Details
#initialize(url, options) ⇒ Fetch
Given a target URL and RR options, creates a fetch object. There is no direct output; this is a parent class that the other fetch classes build on.
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
# File 'lib/retriever/fetch.rb', line 17 def initialize(url, ) @iterator = false @result = [] @connection_tally = { success: 0, error: 0, error_client: 0, error_server: 0 } () if @progress @t = Retriever::Target.new(url, @file_re) @output = "rr-#{@t.host.split('.')[1]}" if @fileharvest && !@output @already_crawled = setup_bloom_filter end |
Instance Attribute Details
#max_pages ⇒ Object (readonly)
Returns the value of attribute max_pages.
13 14 15 |
# File 'lib/retriever/fetch.rb', line 13
# Reader for the max_pages attribute.
#
# @return [Object] the current value of @max_pages
def max_pages
  @max_pages
end
#result ⇒ Object (readonly)
Returns the value of attribute result.
13 14 15 |
# File 'lib/retriever/fetch.rb', line 13
# Reader for the result attribute.
#
# @return [Object] the current value of @result
def result
  @result
end
#t ⇒ Object (readonly)
Returns the value of attribute t.
13 14 15 |
# File 'lib/retriever/fetch.rb', line 13
# Reader for the t attribute.
#
# @return [Object] the current value of @t
def t
  @t
end
Instance Method Details
#dump ⇒ Object
prints current data collection to STDOUT
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/retriever/fetch.rb', line 48
# Prints the current data collection to STDOUT.
#
# Output order: a rule (HR), the connection tally (only when @verbose),
# the target URL, a label for the active mode (@sitemap, @fileharvest or
# @seo, first truthy one wins), the object count, another rule, every entry
# of @result on its own line, and a trailing blank line.
#
# @return [nil]
def dump
  puts HR
  puts "Connection Tally:\n#{@connection_tally}\n#{HR}" if @verbose
  puts "Target URL: #{@t.target}"

  # Only one mode label is printed; the checks are ordered by precedence.
  if @sitemap
    puts 'Sitemap'
  elsif @fileharvest
    puts "File harvest by type: #{@fileharvest}"
  elsif @seo
    puts 'SEO Metrics'
  end

  puts "Data Dump -- Object Count: #{@result.size}"
  puts HR
  @result.each { |line| puts line }
  puts
end
#errlog(msg) ⇒ Object
39 40 41 |
# File 'lib/retriever/fetch.rb', line 39
# Aborts by raising an error whose message is msg prefixed with "ERROR: ".
#
# @param msg [Object] description of the failure; interpolated into the message
# @raise [RuntimeError] always
def errlog(msg)
  raise "ERROR: #{msg}"
end
#good_response?(resp, url) ⇒ Boolean
returns true if resp is ok to continue
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
# File 'lib/retriever/fetch.rb', line 88
# Decides whether a fetched response should be processed further.
#
# * No response: rejected.
# * Redirects are never processed directly. When the redirect location
#   matches the target host regexp (t.host_re) it is pushed onto
#   @temp_link_stack unless @already_crawled already includes it; any other
#   redirect is only logged.
# * Unsuccessful responses increment @connection_tally[:error] plus the
#   server/client specific counter.
# * Responses whose CONTENT_TYPE header is neither text/html nor
#   application/xhtml+xml are inserted into @already_crawled and rejected.
# * Anything else increments @connection_tally[:success].
#
# @param resp [Object, nil] response object exposing #response_header
# @param url [Object] the requested URL; interpolated into log messages and
#   inserted into @already_crawled for non-HTML pages
# @return [Boolean] true when the response is a successful HTML/XHTML page
def good_response?(resp, url)
  return false unless resp

  hdr = resp.response_header
  if hdr.redirection?
    loc = hdr.location
    lg("#{url} Redirected to #{loc}")
    if t.host_re =~ loc
      @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
      lg('--Added to stack for later')
      return false
    end
    lg("Redirection outside of target host. No - go. #{loc}")
    return false
  end

  # Do not continue after an unsuccessful connection.
  unless hdr.successful?
    lg("UNSUCCESSFUL CONNECTION -- #{url}")
    @connection_tally[:error] += 1
    @connection_tally[:error_server] += 1 if hdr.server_error?
    @connection_tally[:error_client] += 1 if hdr.client_error?
    return false
  end

  # Do not continue unless the page is HTML/XHTML. The "+" must be escaped:
  # unescaped it is a quantifier and the literal "application/xhtml+xml"
  # media type would never match.
  unless hdr['CONTENT_TYPE'] =~ %r{(text/html|application/xhtml\+xml)}
    @already_crawled.insert(url)
    lg("Page Not text/html -- #{url}")
    return false
  end

  @connection_tally[:success] += 1
  true
end
#lg(msg) ⇒ Object
43 44 45 |
# File 'lib/retriever/fetch.rb', line 43
# Verbose logger: prints msg prefixed with "### " to STDOUT, but only when
# @verbose is truthy.
#
# @param msg [Object] text to log; interpolated into the output line
# @return [nil]
def lg(msg)
  puts "### #{msg}" if @verbose
end
#start ⇒ Object
33 34 35 36 37 |
# File 'lib/retriever/fetch.rb', line 33
# Prepares the crawl state: stores the result of crawl_page_one in
# @page_one, the result of create_link_stack in @link_stack, and resets
# @temp_link_stack to an empty array.
#
# @return [Array] the new, empty @temp_link_stack (the last assignment)
def start
  @page_one = crawl_page_one
  # create_link_stack runs after @page_one is assigned; keep this order.
  @link_stack = create_link_stack
  @temp_link_stack = []
end
#write ⇒ Object
writes current data collection out to CSV in current directory
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
# File 'lib/retriever/fetch.rb', line 68
# Writes the current data collection to "<@output>.csv" (relative to the
# current directory when @output is a relative name) and prints a short
# summary to STDOUT.
#
# When @seo is truthy a header row is written before the data rows. Each
# entry of @result is appended as one CSV row.
#
# @return [false, nil] false when @output is not set (nothing is written),
#   otherwise nil
def write
  return false unless @output

  CSV.open("#{@output}.csv", 'w') do |csv|
    csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2'] if @seo
    @result.each { |entry| csv << entry }
  end

  puts HR
  puts "File Created: #{@output}.csv"
  puts "Object Count: #{@result.size}"
  puts HR
  puts
end