Class: Apollo::Crawler::BaseCrawler
- Inherits: Object
- Defined in: lib/apollo_crawler/crawler/base_crawler.rb
Direct Known Subclasses
GoogleCrawler, HackerNewsCrawler, SlashdotCrawler, SpiderCrawler, StackoverflowCrawler, XkcdCrawler, YoujizzCrawler
Class Method Summary
- .create_metadoc(url, doc) ⇒ Object
- .fetch(url) ⇒ Object
- .name_re ⇒ Object
- .try_get_doc(root, url) ⇒ Object
- .try_get_url(root, url) ⇒ Object
Instance Method Summary
- #enqueue_url(url) ⇒ Object
- #etl(url = nil, opts = {}, &block) ⇒ Object
  (0) Figure out URL - (1) Extract Data - (2) Extract Links - (3) Go to (0) eventually.
- #extract_data(doc) ⇒ Object
  Extracts data from document.
- #extract_links(doc) ⇒ Object
  Extract links to other documents from this document.
- #fetch_document(url) ⇒ Object
  Fetch document.
- #initialize ⇒ BaseCrawler (constructor)
  A new instance of BaseCrawler.
- #name ⇒ Object
  Name of the crawler.
- #process_url(url) ⇒ Object
- #url ⇒ Object
- #url_processed?(url) ⇒ Boolean
Constructor Details
#initialize ⇒ BaseCrawler
Returns a new instance of BaseCrawler.
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 32

def initialize
  @backlog = []
  @visited = []
end
Class Method Details
.create_metadoc(url, doc) ⇒ Object
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 172

def self.create_metadoc(url, doc)
  body = doc[:body].encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'})

  return {
    'url' => url,
    'doc' => body,
    'hash' => Digest::SHA256.new.update(body).hexdigest,
    'created_at' => Time.now.utc,
    'expires_at' => nil,
    'version' => 0
  }
end
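A quick illustration of the metadoc shape. The URL and body below are made up for the example; only the :body key of the passed hash is used, and the gem is assumed to be loaded.

require 'apollo_crawler'

doc = { :body => "<html><body>Hello</body></html>" }
metadoc = Apollo::Crawler::BaseCrawler.create_metadoc("http://www.example.com/", doc)

metadoc['url']     # => "http://www.example.com/"
metadoc['doc']     # => UTF-8 encoded copy of doc[:body]
metadoc['hash']    # => SHA-256 hex digest of the body
metadoc['version'] # => 0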
.fetch(url) ⇒ Object
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 50

def self.fetch(url)
  RbConfig::DEFAULT_FETCHER.fetch(url)
end
.name_re ⇒ Object
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 37

def self.name_re()
  return /crawler$/
end
.try_get_doc(root, url) ⇒ Object
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 62

def self.try_get_doc(root, url)
  doc = BaseCrawler.try_get_url(root, url)
  # TODO: Set expiration header

  return {
    :doc => doc,
    :url => url
  }
end
Instance Method Details
#enqueue_url(url) ⇒ Object
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 130

def enqueue_url(url)
  urls = []

  return urls if url.nil?

  # We support both - a list of URLs or a single URL
  if(url.kind_of?(Array))
    urls = urls.concat(url)
  else
    urls << url
  end

  urls.each do |u|
    if(url_processed?(u) == false)
      @backlog << u
    end
  end
end
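For illustration, enqueue_url accepts either a single URL string or an array of them, and skips URLs already seen. The URLs below are placeholders.

require 'apollo_crawler'

crawler = Apollo::Crawler::BaseCrawler.new
crawler.enqueue_url("http://www.example.com/")
crawler.enqueue_url(["http://www.example.com/a", "http://www.example.com/b"])

crawler.url_processed?("http://www.example.com/") # => true (already in the backlog)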
#etl(url = nil, opts = {}, &block) ⇒ Object
- (0) Figure out URL
- (1) Extract Data
- (2) Extract Links
- (3) Go to (0) eventually
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 76

def etl(url=nil, opts={}, &block)
  # Look for the passed URL, use the default instead, and fail if it is not valid
  if(url.nil? || url.empty?)
    url = self.url
  end

  # TODO: Be more aggressive, use assert, it is the client's responsibility!
  if(url.nil?)
    return nil
  end

  enqueue_url(url)

  # Counter of processed documents (pages)
  docs_processed = 0

  res = []

  # TODO: Respect limit of documents/urls processed
  while(@backlog.empty? == false)
    url = @backlog.shift

    # puts "Processing '#{url}'"
    doc = self.process_url(url)

    # Increase counter of processed documents
    docs_processed = docs_processed + 1

    @visited << url

    # Process document if it was successfully retrieved
    if(!doc.nil?)
      # TODO: Use log4r and log it only on info level
      if block_given?
        yield doc
      end

      # Add document to queue of results
      res << doc

      enqueue_url(doc[:links].map(){ |l| l[:link] }) if doc[:links]
    end

    # Break if the limit of documents to process was reached
    break if opts[:doc_limit] && docs_processed >= opts[:doc_limit]
  end

  # Return processed documents
  return res
end
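A minimal usage sketch of the ETL loop: subclass BaseCrawler, provide a default URL and the two extract hooks, then call etl. The ExampleCrawler class, URL, and XPath expressions are illustrative assumptions, not part of the library, and a configured default fetcher and cache are assumed.

require 'apollo_crawler'

# Hypothetical crawler used only to demonstrate the etl flow
class ExampleCrawler < Apollo::Crawler::BaseCrawler
  def name
    "Example"
  end

  # Default URL used when etl is called without one
  def url
    "http://www.example.com/"
  end

  # Pull out the page title as the extracted data
  def extract_data(doc)
    doc.xpath("//title").map { |node| { :title => node.text } }
  end

  # Return hashes with a :link key so etl can re-enqueue them
  def extract_links(doc)
    doc.xpath("//a/@href").map { |node| { :link => node.text } }
  end
end

# Crawl at most 5 documents, printing each ETL result as it arrives
results = ExampleCrawler.new.etl(nil, :doc_limit => 5) do |doc|
  puts doc[:data].inspect
end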
#extract_data(doc) ⇒ Object
Extracts data from document
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 228

def extract_data(doc)
  res = []
  return res
end
#extract_links(doc) ⇒ Object
Extract links to other documents from this document
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 234

def extract_links(doc)
  res = []
  return res
end
#fetch_document(url) ⇒ Object
Fetch document
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 186

def fetch_document(url)
  # TODO: Refactor following idiom
  if(url == nil)
    url = self.url
  end

  if(url.nil?)
    return nil
  end

  url = url.to_s

  # TODO: Use some (custom-made) low-level HTTP protocol cache - just to be sure
  cache = Apollo::Cache::Factory.instance.construct
  metadoc = cache.try_get(url) do
    max_attempts = 3
    attempt_no = 0
    success = false

    doc = nil
    while(attempt_no < max_attempts && success == false) do
      begin
        doc = BaseCrawler.fetch(url)
        success = true
      rescue Exception => e
        puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
        sleep 1

        attempt_no = attempt_no + 1
        success = false
      end
    end

    # Create metadata
    BaseCrawler.create_metadoc(url, doc)
  end

  # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
  return Nokogiri::HTML(metadoc['doc'])
end
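Direct use is possible, though etl normally drives fetching. This sketch assumes the default fetcher and cache are configured and uses a placeholder URL.

require 'apollo_crawler'

crawler = Apollo::Crawler::BaseCrawler.new
page = crawler.fetch_document("http://www.example.com/")

# fetch_document returns a parsed Nokogiri::HTML document (or nil without a URL)
puts page.title unless page.nil?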
#name ⇒ Object
Name of the crawler
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 42

def name
  return "Crawler Base"
end
#process_url(url) ⇒ Object
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 147

def process_url(url)
  doc = self.fetch_document(url)
  if(doc.nil?)
    return nil
  end

  # Try to extract data from the document
  data = self.extract_data(doc)

  # Try to extract links to other documents
  links = self.extract_links(doc)

  # TODO: Make it configurable whether links extracted from the doc should be printed
  # puts links.inspect

  # Format ETL result
  res = {
    :crawler => self.class.name,
    :data => data,
    :links => links
  }

  return res
end
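The returned hash bundles the crawler class name with whatever the extract hooks produced; for the base class both are empty. A sketch assuming the fetch succeeds and using a placeholder URL:

crawler = Apollo::Crawler::BaseCrawler.new
result = crawler.process_url("http://www.example.com/")

# => { :crawler => "Apollo::Crawler::BaseCrawler", :data => [], :links => [] }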
#url ⇒ Object
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 46

def url
  return nil
end
#url_processed?(url) ⇒ Boolean
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 126

def url_processed?(url)
  return @backlog.include?(url) || @visited.include?(url)
end