Class: Abrupt::Crawler

Inherits:
Object show all
Defined in:
lib/abrupt/crawler.rb

Overview

Crawls a website, following the links it finds on the same host, and performs the abrupt services on every crawled page. BETA!

Constant Summary collapse

# Maps the one-letter service codes used in the :services option to the
# service classes they select. Frozen so the lookup table cannot be
# modified by accident at runtime.
SERVICE_MAPPING = {
  r: Service::Readability,
  i: Service::Input,
  s: Service::Subject,
  c: Service::Complexity,
  l: Service::Link,
  p: Service::Picture
}.freeze

Instance Method Summary collapse

Constructor Details

#initialize(uri, *args) ⇒ Crawler

Returns a new instance of Crawler.



28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/abrupt/crawler.rb', line 28

# Builds a crawler for the given start uri.
#
# @param uri [String] start uri; it is parsed and normalized with Addressable
# @param args [Array] optional; only the first element is read and it is
#   treated as an options hash with the keys :services, :lang and :nofollow
def initialize(uri, *args)
  @uri = Addressable::URI.parse(uri).normalize
  # Fall back to an empty hash so the crawler can be created without options
  # (args.first is nil in that case and must not be indexed).
  opts = args.first || {}
  @options = {
    lang: 'en',
    services: %w(r i s c l p),
    depth: '3',
    word_limit: 20
  }
  @options[:services] = opts[:services] if opts[:services]
  @options[:lang] = opts[:lang] if opts[:lang]
  # Links are followed unless the caller passes a truthy :nofollow.
  @follow_links = !opts[:nofollow]
  @result = {}
end

Instance Method Details

#canonize_html(html) ⇒ Object



109
110
111
112
113
# File 'lib/abrupt/crawler.rb', line 109

# Runs Service::AbsoluteUrl over the given html, using scheme and host of the
# crawler's start uri as the base url.
#
# @param html [String] html source of a page
# @return [Object] whatever Service::AbsoluteUrl#execute returns
#   (presumably the html with absolute urls - confirm in that service)
def canonize_html(html)
  Service::AbsoluteUrl.new(html, baseurl: "#{@uri.scheme}://#{@uri.host}").execute
end

#crawl(uri = nil) ⇒ JSON

Crawls a page, saves the service results for it in the result hash, follows the links to the same host (unless link following is disabled) and returns the transformed result hash.

Parameters:

  • uri (String) (defaults to: nil)

    the uri to crawl

Returns:

  • (JSON)

    result



48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/abrupt/crawler.rb', line 48

# Crawls one page and, when link following is enabled, recursively every
# same-host page linked from it. Pages already present in the result hash
# are not fetched again.
#
# @param uri [String, nil] the uri to crawl; defaults to the start uri with a
#   trailing slash appended
# @return [Object] the accumulated result hash passed through
#   Service::Base.transform_hash
def crawl(uri = nil)
  Abrupt.log '.'
  target = uri || @uri.to_str.append_last_slash
  unless @result[target]
    page = fetch_html(target)
    # Register the page before running the services so it counts as visited
    # even when nothing could be fetched.
    @result[target] ||= {}
    @result[target] = perform_services(page) if page
    if @follow_links
      uris_with_same_host(target).uniq.each { |link| crawl(link) }
    end
  end
  Service::Base.transform_hash(@result)
end

#fetch_html(uri) ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/abrupt/crawler.rb', line 72

# Fetches the given uri and returns the response body if it is an html page.
#
# Every StandardError - including a malformed or nil uri, which previously
# escaped because parsing happened outside the rescue - is reported on
# stdout and turned into a nil result, so a single bad link does not abort
# the caller.
#
# @param uri [String] the uri to fetch; surrounding whitespace is stripped
# @return [String, nil, false] the body for an html response with a status in
#   200...400, nil for non-html content or on any error, false for any other
#   status code
def fetch_html(uri)
  uri = Addressable::URI.parse(uri.strip).normalize.to_str
  response = ::RestClient.get uri, accept: :html
  content_type = response.headers[:content_type].to_s
  case response.code
  when 200...400
    response.to_str if html?(content_type)
  else
    false
  end
rescue => e
  puts "error fetching html on #{uri}"
  puts e
  nil
end

#html?(content_type) ⇒ Boolean

Returns:

  • (Boolean)


90
91
92
# File 'lib/abrupt/crawler.rb', line 90

# Tells whether a Content-Type header value denotes an html document.
#
# Media types are case-insensitive, so the value is compared in lower case;
# nil and surrounding whitespace are tolerated.
#
# @param content_type [String, nil] value of the Content-Type header
# @return [Boolean] true if the value starts with text/html
def html?(content_type)
  content_type.to_s.strip.downcase.start_with?('text/html')
end

#init_services_hash(html) ⇒ Object



98
99
100
101
102
103
104
105
106
107
# File 'lib/abrupt/crawler.rb', line 98

# Instantiates every service selected in the :services option for the given
# html.
#
# Each service is constructed with the html and with the subset of the
# crawler options that its class lists in available_options.
#
# @param html [String] html source handed to every service
# @return [Hash] service instances keyed by the keyname of their class
def init_services_hash(html)
  @options[:services].each_with_object({}) do |code, services|
    klass = SERVICE_MAPPING[code.to_sym]
    service_opts = {}
    klass.available_options.each do |name|
      # Keys stay as the service declares them; values come from the
      # symbol-keyed crawler options.
      service_opts[name] = @options[name.to_sym]
    end
    instance = klass.new(html, service_opts)
    services[klass.keyname] = instance
  end
end

#perform_services(html) ⇒ Object



115
116
117
118
119
120
121
122
123
# File 'lib/abrupt/crawler.rb', line 115

# Executes all configured services on the given html.
#
# The html is first passed through canonize_html; the services are then
# built from the canonized html and executed one after another.
#
# @param html [String] html source of a page
# @return [Hash] the execute result of every service, keyed by the
#   symbolized service keyname
def perform_services(html)
  services = init_services_hash(canonize_html(html))
  services.each_with_object({}) do |(field, service), output|
    output[field.to_sym] = service.execute
  end
end

#same_host?(uri) ⇒ Boolean

Returns:

  • (Boolean)


94
95
96
# File 'lib/abrupt/crawler.rb', line 94

# Tells whether the given uri points to the host of the crawler's start uri.
#
# @param uri [String, nil] the uri to check
# @return [Boolean] false for a nil or empty uri, otherwise whether the
#   parsed host equals the host of the start uri
def same_host?(uri)
  return false if uri.to_s.empty?

  Addressable::URI.parse(uri).host.eql?(@uri.host)
end

#uris_with_same_host(uri) ⇒ Object

TODO: maybe make this a class method.



62
63
64
65
66
67
68
69
70
# File 'lib/abrupt/crawler.rb', line 62

# Collects the hrefs of the anchors recorded for an already crawled page
# that point to the same host as the start uri.
#
# @param uri [String] key of the page in the result hash
# @return [Array] the same-host hrefs; empty when the page has no :link
#   result or no 'a' entry in it
def uris_with_same_host(uri)
  link_data = @result[uri][:link]
  anchors = link_data && link_data['a']
  return [] unless anchors

  anchors.to_a.each_with_object([]) do |anchor, hrefs|
    href = anchor['href']
    hrefs << href if same_host?(href) && !href.nil?
  end
end