Class: Apollo::Crawler::BaseCrawler
- Inherits:
-
Object
- Object
- Apollo::Crawler::BaseCrawler
- Defined in:
- lib/apollo_crawler/crawler/base_crawler.rb
Direct Known Subclasses
GoogleCrawler, HackerNewsCrawler, SlashdotCrawler, StackoverflowCrawler, XkcdCrawler, YoujizzCrawler
Class Method Summary collapse
- .create_metadoc(url, doc) ⇒ Object
- .fetch(url) ⇒ Object
- .name_re ⇒ Object
- .try_get_doc(root, url) ⇒ Object
- .try_get_url(root, url) ⇒ Object
Instance Method Summary collapse
-
#etl(url = nil, opts = {}, &block) ⇒ Object
-
(0) Figure out URL - (1) Extract Data - (2) Extract Links - (3) Go to (0) eventually.
-
-
#extract_data(doc) ⇒ Object
Extracts data from document.
-
#extract_links(doc) ⇒ Object
Extract links to another documents from this document.
-
#fetch_document(url) ⇒ Object
Fetch document.
-
#initialize ⇒ BaseCrawler
constructor
A new instance of BaseCrawler.
-
#name ⇒ Object
Name of the crawler.
- #process_url(url) ⇒ Object
- #url ⇒ Object
Constructor Details
#initialize ⇒ BaseCrawler
Returns a new instance of BaseCrawler.
31 32 33 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 31

# Creates a new crawler with an empty backlog of URLs awaiting processing.
def initialize
  @backlog = []
end
Class Method Details
.create_metadoc(url, doc) ⇒ Object
160 161 162 163 164 165 166 167 168 169 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 160 def self.(url, doc) return { 'url' => url, 'doc' => doc.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}), 'hash' => Digest::SHA256.new.update(doc).hexdigest, 'created_at' => Time.now.utc, 'expires_at' => nil, 'version' => 0 } end |
.fetch(url) ⇒ Object
48 49 50 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 48 def self.fetch(url) RbConfig::DEFAULT_FETCHER.fetch(url) end |
.name_re ⇒ Object
35 36 37 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 35

# Regexp matching the conventional crawler class-name suffix.
def self.name_re
  /crawler$/
end
.try_get_doc(root, url) ⇒ Object
60 61 62 63 64 65 66 67 68 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 60

# Resolves +url+ against +root+ and fetches it, pairing the fetched
# document with the URL it came from.
def self.try_get_doc(root, url)
  fetched = BaseCrawler.try_get_url(root, url)

  # TODO: Set expiration header
  {
    :doc => fetched,
    :url => url
  }
end
Instance Method Details
#etl(url = nil, opts = {}, &block) ⇒ Object
-
(0) Figure out URL
-
(1) Extract Data
-
(2) Extract Links
-
(3) Go to (0) eventually
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 74 def etl(url=nil, opts={}, &block) # Look for passed URL use default instead and fail if it is not valid if(url.nil? || url.empty?) url = self.url end # TODO: Be more agressive, use assert, it is clients responsibility! if(url.nil?) return nil end # We support both - list of urls or single url if(url.kind_of?(Array)) @backlog.concat(url) else @backlog << url end # Counter of processed documents (pages) docs_processed = 0 res = [] # TODO: Respect limit of documents/urls processed while(@backlog.empty? == false) url = @backlog.shift # puts "Processing '#{url}'" doc = self.process_url(url) # Increase counter of processed documents docs_processed = docs_processed + 1 # Process document if was successfuly retreived if(!doc.nil?) # TODO: Use log4r and log it only on info level if block_given? yield doc end # Add document to queue of results res << doc # If if(doc[:links].nil? == false) doc[:links].each do |link| url = link[:link].to_s # TODO: Use log4r and log it only on info level #puts url # TODO: Check if it is unique @backlog << url end end end # Break if limit of documents to processed was reached break if opts[:doc_limit] && docs_processed >= opts[:doc_limit] end return res end |
#extract_data(doc) ⇒ Object
Extracts data from document
212 213 214 215 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 212

# Extracts data from the document. The base implementation extracts
# nothing; subclasses override this with crawler-specific logic.
def extract_data(doc)
  []
end
#extract_links(doc) ⇒ Object
Extract links to another documents from this document
218 219 220 221 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 218

# Extracts links to other documents from this document. The base
# implementation finds none; subclasses override this.
def extract_links(doc)
  []
end
#fetch_document(url) ⇒ Object
Fetch document
172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 172

# Fetches the document at +url+ (or the crawler's default URL) through the
# cache, retrying transient fetch failures, and returns it parsed with
# Nokogiri.
#
# NOTE(review): the extracted doc page had the local variable name stripped
# (` = cache.get(url)`, `Nokogiri::HTML(['doc'])`); restored as `metadoc`
# to match the 'doc' key produced by create_metadoc — verify against the
# original source.
#
# @param url [String, nil] URL to fetch; defaults to #url when nil
# @return [Nokogiri::HTML::Document, nil] nil when no URL is available
def fetch_document(url)
  # TODO: Refactor following idiom
  url = self.url if url.nil?
  return nil if url.nil?

  # TODO: Use some (custom-made) low-level HTTP Protocol cache - just for sure
  cache = Apollo::Cache::Factory.instance.construct
  metadoc = cache.get(url) do
    # Cache miss: fetch with up to 3 attempts, sleeping 1s between failures
    max_attempts = 3
    attempt_no = 0
    success = false

    doc = nil
    while(attempt_no < max_attempts && success == false) do
      begin
        doc = BaseCrawler.fetch(url)
        success = true
      rescue StandardError => e
        # Narrowed from `rescue Exception` — rescuing Exception would also
        # swallow SignalException/SystemExit and make the process unkillable
        puts "EXCEPTION: Unable to fetch '#{url}', reason: '#{e.to_s}'"
        sleep 1
        attempt_no = attempt_no + 1
        success = false
      end
    end

    # Create metadata to store in the cache
    BaseCrawler.create_metadoc(url, doc)
  end

  # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
  return Nokogiri::HTML(metadoc['doc'])
end
#name ⇒ Object
Name of the crawler
40 41 42 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 40

# Human-readable name of the crawler; subclasses override this.
def name
  "Crawler Base"
end
#process_url(url) ⇒ Object
135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 135

# Fetches one URL and runs the extraction pipeline over it.
# Returns nil when the document could not be fetched, otherwise a Hash
# with :crawler (class name), :data and :links.
def process_url(url)
  document = self.fetch_document(url)
  return nil if document.nil?

  # Try to extract data from the document
  extracted_data = self.extract_data(document)

  # Try to extract links leading to further documents
  extracted_links = self.extract_links(document)

  # TODO: Make configurable if links extracted from doc should be printed
  # puts extracted_links.inspect

  # Format ETL result
  {
    :crawler => self.class.name,
    :data => extracted_data,
    :links => extracted_links
  }
end
#url ⇒ Object
44 45 46 |
# File 'lib/apollo_crawler/crawler/base_crawler.rb', line 44

# Default URL to crawl. The base class has none; subclasses override this.
def url
  nil
end