Class: WebCrawler::Parsers::Url
- Inherits:
-
Object
- Object
- WebCrawler::Parsers::Url
- Defined in:
- lib/web_crawler/parsers/url.rb
Instance Attribute Summary
-
#host ⇒ Object
readonly
Returns the value of attribute host.
-
#scheme ⇒ Object
readonly
Returns the value of attribute scheme.
Instance Method Summary
-
#initialize(host, options = { }) ⇒ Url
constructor
A new instance of Url.
- #normalize(url) ⇒ Object
- #parse(response, &filter) ⇒ Object
Constructor Details
#initialize(host, options = { }) ⇒ Url
Returns a new instance of Url.
5 6 7 8 9 10 11 |
# File 'lib/web_crawler/parsers/url.rb', line 5
#
# Builds a URL parser bound to +host+.
#
# @param host [#to_s] host name or URL of the site; converted with +to_s+,
#   passed through +normalize_host+ and then parsed with +URI.parse+
# @param options [Hash] parser options; a truthy +:secure+ selects 'https'
#   instead of 'http' as the default scheme. The hash is stored in
#   +@options+ (+normalize+ reads +:same_host+ from it).
def initialize(host, options = { })
  # Default scheme, assigned before normalize_host runs.
  # NOTE(review): presumably normalize_host uses @scheme for hosts given
  # without a scheme - confirm against its definition.
  @scheme = options[:secure] ? 'https' : 'http'
  @host = URI.parse(normalize_host(host.to_s))
  # The scheme actually present in the parsed host replaces the default.
  @scheme = @host.scheme
  @options = options
  set_current_page
end
Instance Attribute Details
#host ⇒ Object (readonly)
Returns the value of attribute host.
3 4 5 |
# File 'lib/web_crawler/parsers/url.rb', line 3
#
# @return [Object] the value of the +host+ attribute
def host
  @host
end
#scheme ⇒ Object (readonly)
Returns the value of attribute scheme.
3 4 5 |
# File 'lib/web_crawler/parsers/url.rb', line 3
#
# @return [Object] the value of the +scheme+ attribute
def scheme
  @scheme
end
Instance Method Details
#normalize(url) ⇒ Object
19 20 21 22 23 24 25 26 27 |
# File 'lib/web_crawler/parsers/url.rb', line 19
#
# Normalizes a single link target.
#
# @param url [String, nil] raw href value; nil when the anchor has no href
# @return [String, nil]
#   * the result of +normalize_host(url)+ when +url+ is an absolute URL on
#     this parser's host,
#   * +join(url).to_s+ when +url+ starts with '/', '?' or '#',
#   * +nil+ for a nil href, for a bare '#', and for any other URL when
#     +@options[:same_host]+ is set,
#   * +url+ unchanged otherwise.
def normalize(url)
  # An anchor without an href yields nil; there is nothing to normalize.
  return nil if url.nil?

  # Absolute URL pointing at our own host. The host is escaped so that '.'
  # is literal, and must be followed by a path/query/fragment/port delimiter
  # or the end of the string, so "example.com.evil.org" is not mistaken for
  # "example.com". \A anchors at the start of the string, not of any line.
  own_host = %r{\A(?:#{Regexp.escape(@host.scheme.to_s)}|https|)://#{Regexp.escape(@host.host.to_s)}(?=[/?\#:]|\z)}

  if url =~ own_host
    normalize_host(url)
  elsif url == '#'
    nil
  elsif url.start_with?('/', '?', '#')
    # Relative reference: resolve it through join.
    join(url).to_s
  else
    @options[:same_host] ? nil : url
  end
end
#parse(response, &filter) ⇒ Object
13 14 15 16 17 |
# File 'lib/web_crawler/parsers/url.rb', line 13
#
# Extracts the link targets of all <a> elements in +response+.
#
# Each href is passed through +normalize+; nil results are dropped and
# duplicates removed.
#
# @param response [#to_s] document to scan; converted with +to_s+ and
#   handed to Hpricot
# @yield [url] optional filter; only URLs for which the block returns a
#   truthy value are kept
# @return [Array] normalized, unique URLs (filtered when a block is given)
def parse(response, &filter)
  links = (Hpricot(response.to_s) / "a").map { |a| normalize(a["href"]) }.compact.uniq

  # Return the filtered list itself. Reassigning a block-local variable
  # inside #tap would leave the returned array unfiltered.
  block_given? ? links.select(&filter) : links
end