Class: Abrupt::Crawler
Overview
Crawler for a website, including all followed URLs, that performs the abrupt services on each crawled page. BETA!
Constant Summary collapse
# Maps the one-letter service codes accepted in the :services option to the
# service classes they select. Frozen so the lookup table cannot be mutated
# by accident at runtime.
SERVICE_MAPPING = {
  r: Service::Readability,
  i: Service::Input,
  s: Service::Subject,
  c: Service::Complexity,
  l: Service::Link,
  p: Service::Picture
}.freeze
Instance Method Summary collapse
- #canonize_html(html) ⇒ Object
-
#crawl(uri = nil) ⇒ JSON
Crawls a page, saves the service results in the result hash, follows the same-host links found on the page (unless nofollow is set) and returns the transformed result hash.
- #fetch_html(uri) ⇒ Object
- #html?(content_type) ⇒ Boolean
- #init_services_hash(html) ⇒ Object
-
#initialize(uri, *args) ⇒ Crawler
constructor
A new instance of Crawler.
- #perform_services(html) ⇒ Object
- #same_host?(uri) ⇒ Boolean
-
#uris_with_same_host(uri) ⇒ Object
TODO: maybe as class method.
Constructor Details
#initialize(uri, *args) ⇒ Crawler
Returns a new instance of Crawler.
28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/abrupt/crawler.rb', line 28
# Builds a crawler rooted at +uri+.
#
# @param uri [String] start address; it is parsed and normalized with
#   Addressable before being stored.
# @param args [Array] only the first element is used; it is read as an
#   options hash with the keys :services, :lang and :nofollow.
# @return [Crawler] a new instance of Crawler
def initialize(uri, *args)
  @uri = Addressable::URI.parse(uri).normalize
  # The options argument is optional: fall back to an empty hash so that
  # Crawler.new(uri) (or an explicit nil) does not raise NoMethodError.
  opts = args.first || {}
  @options = {
    lang: 'en',
    services: %w(r i s c l p),
    depth: '3',
    word_limit: 20
  }
  # Only :services and :lang can be overridden by the caller here.
  @options[:services] = opts[:services] if opts[:services]
  @options[:lang] = opts[:lang] if opts[:lang]
  # Links are followed unless the caller passes a truthy :nofollow.
  @follow_links = !opts[:nofollow]
  @result = {}
end
Instance Method Details
#canonize_html(html) ⇒ Object
109 110 111 112 113 |
# File 'lib/abrupt/crawler.rb', line 109
# Runs Service::AbsoluteUrl over +html+ with the crawled site's base url and
# returns whatever that service's #execute returns.
#
# @param html [String] markup of the fetched page
# @return [Object] result of Service::AbsoluteUrl#execute
def canonize_html(html)
  baseurl = "#{@uri.scheme}://#{@uri.host}"
  # Keep an explicitly given port (e.g. http://localhost:3000); without it
  # the base url would point at the default port of the scheme.
  # NOTE(review): relies on Addressable returning nil from #port when the
  # normalized URI carries no non-default port - confirm.
  baseurl += ":#{@uri.port}" if @uri.port
  Service::AbsoluteUrl.new(html, baseurl: baseurl).execute
end
#crawl(uri = nil) ⇒ JSON
Crawls a page, saves the service results in the result hash, follows the same-host links found on the page (unless nofollow is set) and returns the transformed result hash.
48 49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/abrupt/crawler.rb', line 48
# Visits +uri+ (or the crawler's start uri when none is given), stores the
# service results for it in @result and, when link following is enabled,
# recurses into every same-host link found on that page. Pages that already
# have an entry in @result are not fetched again.
#
# @param uri [String, nil] address to crawl; defaults to the start uri with
#   a trailing slash appended
# @return [Object] the value of Service::Base.transform_hash(@result)
def crawl(uri = nil)
  Abrupt.log '.'
  target = uri || @uri.to_str.append_last_slash
  unless @result[target]
    page = fetch_html(target)
    # Register the page even when nothing could be fetched, so that it is
    # not requested a second time.
    @result[target] ||= {}
    @result[target] = perform_services(page) if page
    # new_uris.select! { |url| same_host?(url) } # filter
    if @follow_links
      uris_with_same_host(target).uniq.each do |link|
        crawl(link)
      end
    end
  end
  Service::Base.transform_hash(@result)
end
#fetch_html(uri) ⇒ Object
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# File 'lib/abrupt/crawler.rb', line 72
# Downloads +uri+ and hands back its body when it is an HTML document.
#
# @param uri [String] address to fetch; stripped and normalized first
# @return [String, nil, false] the response body when the status code is in
#   200...400 and the content type is HTML; nil when the content type is not
#   HTML or when the request raised; false for any other status code
def fetch_html(uri)
  # Normalization happens outside the begin block on purpose: an error while
  # parsing the address is not rescued here.
  address = Addressable::URI.parse(uri.strip).normalize.to_str
  begin
    response = ::RestClient.get(address, accept: :html)
    content_type = response.headers[:content_type].to_s
    if (200...400).cover?(response.code)
      html?(content_type) ? response.to_str : nil
    else
      false
    end
  rescue => error
    # Best effort: report the failure and carry on with the crawl.
    puts "error fetching html on #{address}"
    puts error
    nil
  end
end
#html?(content_type) ⇒ Boolean
90 91 92 |
# File 'lib/abrupt/crawler.rb', line 90
# Tells whether a Content-Type header value denotes an HTML document.
#
# Media types are case-insensitive, so the value is lower-cased before the
# comparison; surrounding whitespace is ignored and nil counts as "not HTML".
#
# @param content_type [String, nil] raw Content-Type header value, possibly
#   with parameters such as "; charset=utf-8"
# @return [Boolean] true when the media type is text/html
def html?(content_type)
  content_type.to_s.strip.downcase.start_with?('text/html')
end
#init_services_hash(html) ⇒ Object
98 99 100 101 102 103 104 105 106 107 |
# NOTE(review): this listing lost tokens during extraction - the name assigned
# from `service_class.` and the receiver of `.map` are missing, so the code
# below is not valid Ruby as shown. Restore it from lib/abrupt/crawler.rb
# before relying on it.
# From what is still visible: for every code in @options[:services] the
# method looks up the class in SERVICE_MAPPING, builds an options hash from
# @options, instantiates the service with (html, opts) and returns a hash
# that maps each service class's keyname to the new service instance.
# File 'lib/abrupt/crawler.rb', line 98 def init_services_hash(html) @options[:services].map do |s| s = s.to_sym service_class = SERVICE_MAPPING[s] = service_class. opts = .map { |o| [o, @options[o.to_sym]] }.to_h service = service_class.new(html, opts) [service_class.keyname, service] end.to_h end |
#perform_services(html) ⇒ Object
115 116 117 118 119 120 121 122 123 |
# File 'lib/abrupt/crawler.rb', line 115
# Executes every configured service against +html+.
#
# The markup is first passed through canonize_html; the services are then
# built by init_services_hash and each one's #execute result is stored under
# its key, converted to a symbol.
#
# @param html [String] markup of the fetched page
# @return [Hash] service key (Symbol) => result of that service's #execute
def perform_services(html)
  prepared = canonize_html(html)
  init_services_hash(prepared).each_with_object({}) do |(keyname, service), collected|
    collected[keyname.to_sym] = service.execute
  end
end
#same_host?(uri) ⇒ Boolean
94 95 96 |
# File 'lib/abrupt/crawler.rb', line 94
# Tells whether +uri+ points at the same host as the crawler's start uri.
#
# The candidate is normalized before the comparison, like the start uri is in
# the constructor, so that hosts differing only in letter case still match.
# Blank values and addresses that Addressable rejects as invalid are treated
# as "different host" instead of aborting the crawl.
#
# @param uri [String, nil] link target taken from a page
# @return [Boolean]
def same_host?(uri)
  return false if uri.to_s.empty?

  Addressable::URI.parse(uri).normalize.host.eql?(@uri.host)
rescue Addressable::URI::InvalidURIError
  # A malformed href on a crawled page must not stop the whole run.
  false
end
#uris_with_same_host(uri) ⇒ Object
TODO: maybe as class method
62 63 64 65 66 67 68 69 70 |
# File 'lib/abrupt/crawler.rb', line 62
# Collects the href values of the 'a' entries stored under :link in the
# result of +uri+, keeping only those for which same_host? is true.
#
# @param uri [String] key of an already crawled page in @result
# @return [Array] matching hrefs; empty when the page has no :link / 'a' data
def uris_with_same_host(uri)
  anchors = @result[uri][:link] && @result[uri][:link]['a']
  return [] unless anchors

  hrefs = anchors.to_a.map { |anchor| anchor['href'] }
  hrefs.select { |href| same_host?(href) }
end