Class: Kudzu::Agent
- Inherits: Object
- Defined in:
- lib/kudzu/agent.rb,
lib/kudzu/agent/robots.rb,
lib/kudzu/agent/fetcher.rb,
lib/kudzu/agent/sleeper.rb,
lib/kudzu/agent/response.rb,
lib/kudzu/agent/reference.rb,
lib/kudzu/agent/robots/txt.rb,
lib/kudzu/agent/url_filterer.rb,
lib/kudzu/agent/util/matcher.rb,
lib/kudzu/agent/page_filterer.rb,
lib/kudzu/agent/robots/parser.rb,
lib/kudzu/agent/url_extractor.rb,
lib/kudzu/agent/http/connection.rb,
lib/kudzu/agent/util/title_parser.rb,
lib/kudzu/agent/http/connection_pool.rb,
lib/kudzu/agent/util/charset_detector.rb,
lib/kudzu/agent/util/mime_type_detector.rb,
lib/kudzu/agent/util/content_type_parser.rb
Defined Under Namespace
Classes: Fetcher, Http, PageFilterer, Reference, Response, Robots, Sleeper, UrlExtractor, UrlFilterer, Util
Instance Method Summary
collapse
Constructor Details
#initialize(config, &block) ⇒ Agent
Returns a new instance of Agent.
5
6
7
8
9
10
11
12
13
|
# File 'lib/kudzu/agent.rb', line 5
# Builds an agent around +config+ and wires up its collaborators.
#
# The robots handler is created first because both the fetcher and the URL
# filterer receive it as a constructor argument.
#
# @param config the configuration object handed to every collaborator
# @param block accepted for interface compatibility; not used in this method
def initialize(config, &block)
  @config = config

  # Shared by the fetcher and the URL filterer below.
  @robots = Robots.new(config)
  @fetcher = Fetcher.new(config, @robots)

  @url_extractor = UrlExtractor.new(config)
  @url_filterer = UrlFilterer.new(config, @robots)
  @page_filterer = PageFilterer.new(config)
end
|
Instance Method Details
32
33
34
35
36
|
# File 'lib/kudzu/agent.rb', line 32
# NOTE(review): the method name on the `def` line and the name of the method
# invoked on @url_extractor appear to have been dropped from this listing.
# Restore both from lib/kudzu/agent.rb (line 32) before relying on this snippet.
#
# Returns an empty array when redirect_url_allowed?(response) is falsy.
# Otherwise hands the response to @url_extractor, then passes that result
# together with response.url to @url_filterer.filter and returns its result.
def (response)
return [] unless redirect_url_allowed?(response)
refs = @url_extractor.(response)
@url_filterer.filter(refs, response.url)
end
|
#fetch(url, request_header = {}) ⇒ Object
20
21
22
23
24
25
26
27
28
29
30
|
# File 'lib/kudzu/agent.rb', line 20
# Fetches +url+ through the fetcher and, when the fetch succeeded, fills in
# derived attributes on the response.
#
# @param url the URL to fetch; passed to the fetcher unchanged
# @param request_header [Hash] forwarded to the fetcher as its
#   +request_header:+ keyword argument (defaults to an empty hash)
# @return [Object] the response returned by the fetcher; when
#   +response.fetched?+ is falsy it is returned as-is, otherwise its size,
#   digest, mime_type, title and (for text responses) charset are set first
def fetch(url, request_header = {})
  response = @fetcher.fetch(url, request_header: request_header)
  return response unless response.fetched?

  body = response.body
  response.size = body.size
  response.digest = Digest::MD5.hexdigest(body)
  response.mime_type = Util::MimeTypeDetector.detect(response)
  # Charset detection is only attempted for responses that report text?.
  response.charset = Util::CharsetDetector.detect(response) if response.text?
  response.title = Util::TitleParser.parse(response)
  response
end
|
#filter_response?(response) ⇒ Boolean
38
39
40
41
|
# File 'lib/kudzu/agent.rb', line 38
# Tells whether +response+ should be filtered out.
#
# A response is kept only when redirect_url_allowed? accepts it and the page
# filterer allows it; anything else is filtered. The page filterer is not
# consulted when the redirect check already rejects the response.
#
# @param response the response to check
# @return [Boolean] true when the response should be filtered out
def filter_response?(response)
  !(redirect_url_allowed?(response) && @page_filterer.allowed?(response))
end
|
#start ⇒ Object
15
16
17
18
|
# File 'lib/kudzu/agent.rb', line 15
# Yields to the caller's block and closes the fetcher's connection pool
# afterwards, including when the block raises or exits non-locally.
#
# @yield runs the caller's work while the pool is open
# @return [Object] the result of closing the pool when the block completes
#   normally
def start
  close_attempted = false
  yield
  # Flag is set before closing so a failing close is not retried in ensure.
  close_attempted = true
  @fetcher.pool.close
ensure
  # Reached with the flag unset only when the block raised or exited early.
  @fetcher.pool.close unless close_attempted
end
|