Class: Apollo::Crawler::BaseCrawler
- Inherits:
-
Object
- Object
- Apollo::Crawler::BaseCrawler
- Defined in:
- lib/apollo_crawler/crawler/base_crawler.rb
Direct Known Subclasses
GoogleCrawler, HackerNewsCrawler, SlashdotCrawler, StackoverflowCrawler, XkcdCrawler, YoujizzCrawler
Class Method Summary collapse
- .create_metadoc(url, doc) ⇒ Object
- .fetch(url) ⇒ Object
- .name_re ⇒ Object
- .try_get_doc(root, url) ⇒ Object
- .try_get_url(root, url) ⇒ Object
Instance Method Summary collapse
-
#etl(url = nil, opts = {}, &block) ⇒ Object
-
(0) Figure out URL - (1) Extract Data - (2) Extract Links - (3) Go to (0) eventually.
-
-
#extract_data(doc) ⇒ Object
Extracts data from document.
-
#extract_links(doc) ⇒ Object
Extract links to another documents from this document.
-
#fetch_document(url) ⇒ Object
Fetch document.
-
#initialize ⇒ BaseCrawler
constructor
A new instance of BaseCrawler.
-
#name ⇒ Object
Name of the crawler.
- #process_url(url) ⇒ Object
- #url ⇒ Object
Constructor Details
#initialize ⇒ BaseCrawler
Returns a new instance of BaseCrawler.
31 32 33 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 31

# Creates a new crawler with an empty backlog of URLs awaiting processing.
def initialize
  @backlog = []
end
Class Method Details
.create_metadoc(url, doc) ⇒ Object
160 161 162 163 164 165 166 167 168 169 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 160 def self.(url, doc) return { 'url' => url, 'doc' => doc.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}), 'hash' => Digest::SHA256.new.update(doc).hexdigest, 'created_at' => Time.now.utc, 'expires_at' => nil, 'version' => 0 } end |
.fetch(url) ⇒ Object
48 49 50 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 48 def self.fetch(url) RbConfig::DEFAULT_FETCHER.fetch(url) end |
.name_re ⇒ Object
35 36 37 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 35

# Regexp matching the conventional crawler class-name suffix.
def self.name_re
  /crawler$/
end
.try_get_doc(root, url) ⇒ Object
60 61 62 63 64 65 66 67 68 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 60

# Resolves +url+ against +root+ and fetches it, pairing the fetched
# document with the URL it came from.
def self.try_get_doc(root, url)
  fetched = BaseCrawler.try_get_url(root, url)

  # TODO: Set expiration header
  {
    :doc => fetched,
    :url => url
  }
end
Instance Method Details
#etl(url = nil, opts = {}, &block) ⇒ Object
-
(0) Figure out URL
-
(1) Extract Data
-
(2) Extract Links
-
(3) Go to (0) eventually
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 74 def etl(url=nil, opts={}, &block) # Look for passed URL use default instead and fail if it is not valid if(url.nil? || url.empty?) url = self.url end # TODO: Be more agressive, use assert, it is clients responsibility! if(url.nil?) return nil end # We support both - list of urls or single url if(url.kind_of?(Array)) @backlog.concat(url) else @backlog << url end # Counter of processed documents (pages) docs_processed = 0 res = [] # TODO: Respect limit of documents/urls processed while(@backlog.empty? == false) url = @backlog.shift # puts "Processing '#{url}'" doc = self.process_url(url) # Increase counter of processed documents docs_processed = docs_processed + 1 # Process document if was successfuly retreived if(!doc.nil?) # TODO: Use log4r and log it only on info level if block_given? yield doc end # Add document to queue of results res << doc # If if(doc[:links].nil? == false) doc[:links].each do |link| url = link[:link].to_s # TODO: Use log4r and log it only on info level #puts url # TODO: Check if it is unique @backlog << url end end end # Break if limit of documents to processed was reached break if opts[:doc_limit] && docs_processed >= opts[:doc_limit] end return res end |
#extract_data(doc) ⇒ Object
Extracts data from document
212 213 214 215 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 212

# Extracts data from the document. The base implementation extracts
# nothing; subclasses override this with crawler-specific logic.
def extract_data(doc)
  []
end
#extract_links(doc) ⇒ Object
Extract links to another documents from this document
218 219 220 221 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 218

# Extracts links to other documents from this document. The base
# implementation finds none; subclasses override this.
def extract_links(doc)
  []
end
#fetch_document(url) ⇒ Object
Fetch document
172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 172

# Fetches the document at +url+ (or the crawler's default URL) through the
# cache, retrying transient fetch failures, and returns it parsed with
# Nokogiri.
#
# NOTE(review): the extracted doc page had the local variable name stripped
# (` = cache.get(url)`, `Nokogiri::HTML(['doc'])`); restored as `metadoc`
# to match the 'doc' key produced by create_metadoc — verify against the
# original source.
#
# @param url [String, nil] URL to fetch; defaults to #url when nil
# @return [Nokogiri::HTML::Document, nil] nil when no URL is available
def fetch_document(url)
  # TODO: Refactor following idiom
  url = self.url if url.nil?
  return nil if url.nil?

  # TODO: Use some (custom-made) low-level HTTP Protocol cache - just for sure
  cache = Apollo::Cache::Factory.instance.construct
  metadoc = cache.get(url) do
    # Cache miss: fetch with up to 3 attempts, sleeping 1s between failures
    max_attempts = 3
    attempt_no = 0
    success = false

    doc = nil
    while(attempt_no < max_attempts && success == false) do
      begin
        doc = BaseCrawler.fetch(url)
        success = true
      rescue StandardError => e
        # Narrowed from `rescue Exception` — rescuing Exception would also
        # swallow SignalException/SystemExit and make the process unkillable
        puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
        sleep 1
        attempt_no = attempt_no + 1
        success = false
      end
    end

    # Create metadata to store in the cache
    BaseCrawler.create_metadoc(url, doc)
  end

  # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
  return Nokogiri::HTML(metadoc['doc'])
end
#name ⇒ Object
Name of the crawler
40 41 42 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 40

# Human-readable name of the crawler; subclasses override this.
def name
  "Crawler Base"
end
#process_url(url) ⇒ Object
135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 135

# Fetches one URL and runs the extraction pipeline over it.
# Returns nil when the document could not be fetched, otherwise a Hash
# with :crawler (class name), :data and :links.
def process_url(url)
  document = self.fetch_document(url)
  return nil if document.nil?

  # Try to extract data from the document
  extracted_data = self.extract_data(document)

  # Try to extract links leading to further documents
  extracted_links = self.extract_links(document)

  # TODO: Make configurable if links extracted from doc should be printed
  # puts extracted_links.inspect

  # Format ETL result
  {
    :crawler => self.class.name,
    :data => extracted_data,
    :links => extracted_links
  }
end
#url ⇒ Object
44 45 46 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 44

# Default URL to crawl. The base class has none; subclasses override this.
def url
  nil
end