Class: Recluse::Profile
- Inherits:
-
Object
- Object
- Recluse::Profile
- Defined in:
- lib/recluse/profile.rb
Overview
A profile is an atomic unit of rules for link checking.
Instance Attribute Summary collapse
-
#blacklist ⇒ Object
Array of URL patterns to check.
-
#email ⇒ Object
Used in the user-agent to identify who is running the crawler.
-
#internal_only ⇒ Object
Don’t check external URLs.
-
#name ⇒ Object
Identifier of the profile.
-
#redirect ⇒ Object
When enabled, will follow redirects and report only the status code for the page that is landed upon.
-
#results ⇒ Object
Hash of resulting HashTrees.
-
#roots ⇒ Object
Array of URLs to start spidering.
-
#scheme_squash ⇒ Object
HTTP and HTTPS schemed URLs are treated as equal.
-
#tasks ⇒ Object
Hash of the tasks (tests) that have been run, keyed by test key.
-
#whitelist ⇒ Object
Array of exceptions to the blacklist.
Class Method Summary collapse
-
.load(profile) ⇒ Object
Loads profile by name.
Instance Method Summary collapse
-
#==(other) ⇒ Object
Test if profiles share the same configuration options.
-
#create_agent ⇒ Object
Create a Mechanize agent.
-
#initialize(name, roots, email, blacklist: [], whitelist: [], internal_only: false, scheme_squash: false, redirect: false) ⇒ Profile
constructor
Create a profile.
-
#save ⇒ Object
Saves profile to ~/.recluse/NAME.yaml.
-
#test(key, options = {}) ⇒ Object
Runs test.
Constructor Details
#initialize(name, roots, email, blacklist: [], whitelist: [], internal_only: false, scheme_squash: false, redirect: false) ⇒ Profile
Create a profile.
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/recluse/profile.rb', line 63

# Create a profile.
#
# @param name [Object] identifier of the profile
# @param roots [Array] URLs to start spidering from; entries that are not
#   already +Link+ objects are wrapped with <tt>Link.new(root, :root)</tt>
# @param email [Object] contact address; +create_agent+ appends it to the
#   user-agent string
# @param blacklist [Array] URL patterns (defaults to an empty array)
# @param whitelist [Array] exceptions to the blacklist (defaults to an empty array)
# @param internal_only [Boolean] don't check external URLs
# @param scheme_squash [Boolean] treat HTTP and HTTPS schemed URLs as equal
# @param redirect [Boolean] follow redirects and report the landing page's status
# @raise [ProfileError] if +roots+ is empty
def initialize(
  name,
  roots,
  email,
  blacklist: [],
  whitelist: [],
  internal_only: false,
  scheme_squash: false,
  redirect: false
)
  raise ProfileError, 'Profile needs roots for starting point' if roots.empty?

  @name = name
  @email = email
  # Keep existing Link objects (including subclasses) untouched; wrap
  # everything else as a root link.
  @roots = roots.map { |root| root.is_a?(Link) ? root : Link.new(root, :root) }
  @blacklist = blacklist
  @whitelist = whitelist
  @internal_only = internal_only
  @scheme_squash = scheme_squash
  @redirect = redirect
  # Both start empty and are filled in by #test.
  @tasks = {}
  @results = {}
end
Instance Attribute Details
#blacklist ⇒ Object
Array of URL patterns to check. Optional. Defaults to empty array.
35 36 37 |
# File 'lib/recluse/profile.rb', line 35

# Returns the profile's blacklist (array of URL patterns).
def blacklist
  @blacklist
end
#email ⇒ Object
Used in the user-agent to identify who is running the crawler. This is so that if there’s a problem with your spidering, you will be contacted and not the author of Recluse. Required.
31 32 33 |
# File 'lib/recluse/profile.rb', line 31

# Returns the contact email that is placed in the crawler's user-agent.
def email
  @email
end
#internal_only ⇒ Object
Don’t check external URLs. Optional. Defaults to false.
43 44 45 |
# File 'lib/recluse/profile.rb', line 43

# Returns whether external URLs are skipped.
def internal_only
  @internal_only
end
#name ⇒ Object
Identifier of the profile. Make sure that it is filename friendly. Required.
23 24 25 |
# File 'lib/recluse/profile.rb', line 23

# Returns the identifier of the profile.
def name
  @name
end
#redirect ⇒ Object
When enabled, will follow redirects and report only the status code for the page that is landed upon. When disabled, will report the redirect status code. Defaults to false.
51 52 53 |
# File 'lib/recluse/profile.rb', line 51

# Returns whether redirects are followed.
def redirect
  @redirect
end
#results ⇒ Object
Hash of resulting HashTrees.
59 60 61 |
# File 'lib/recluse/profile.rb', line 59

# Returns the hash of resulting HashTrees.
def results
  @results
end
#roots ⇒ Object
Array of URLs to start spidering. Required.
27 28 29 |
# File 'lib/recluse/profile.rb', line 27

# Returns the array of roots from which spidering starts.
def roots
  @roots
end
#scheme_squash ⇒ Object
HTTP and HTTPS schemed URLs are treated as equal. Optional. Defaults to false.
47 48 49 |
# File 'lib/recluse/profile.rb', line 47

# Returns whether HTTP and HTTPS schemed URLs are treated as equal.
def scheme_squash
  @scheme_squash
end
#tasks ⇒ Object
Hash of the tasks (tests) that have been run, keyed by test key.
55 56 57 |
# File 'lib/recluse/profile.rb', line 55

# Returns the hash of tasks that have been run, keyed by test key.
def tasks
  @tasks
end
#whitelist ⇒ Object
Array of exceptions to the blacklist. Optional. Defaults to empty array.
39 40 41 |
# File 'lib/recluse/profile.rb', line 39

# Returns the array of exceptions to the blacklist.
def whitelist
  @whitelist
end
Class Method Details
.load(profile) ⇒ Object
Loads profile by name.
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
# File 'lib/recluse/profile.rb', line 151

# Loads profile by name.
#
# Reads "<profile>.yaml" through the '.recluse' UserConfig and builds a
# Profile from it. Keys that are missing or nil are left out so that the
# constructor defaults apply; 'roots' falls back to [] and 'email' to ''.
#
# @param profile [String] profile name (file name without the ".yaml" suffix)
# @return [Profile] the profile built from the stored options
# @raise [ProfileError] if "<profile>.yaml" does not exist in the user config
def self.load(profile)
  uconf = UserConfig.new '.recluse'
  raise ProfileError, "Profile '#{profile}' doesn't exist" unless uconf.exist?("#{profile}.yaml")

  options = uconf["#{profile}.yaml"]

  # Collect only the optional settings that are actually present.
  opts = {}
  %i[blacklist whitelist internal_only scheme_squash redirect].each do |e|
    estr = e.to_s
    opts[e] = options[estr] if options.key?(estr) && !options[estr].nil?
  end

  Profile.new(
    profile,
    (options.key?('roots') && !options['roots'].nil? ? options['roots'] : []),
    (options.key?('email') && !options['email'].nil? ? options['email'] : ''),
    **opts
  )
end
Instance Method Details
#==(other) ⇒ Object
Test if profiles share the same configuration options.
140 141 142 143 144 145 146 147 |
# File 'lib/recluse/profile.rb', line 140

# Test if profiles share the same configuration options.
#
# Every instance variable is compared except @results, which is ignored.
# @roots is first compared by the string form of each root; if that does not
# match, the roots are compared directly like any other variable.
#
# @param other [Object] object to compare against
# @return [Boolean] false when +other+ is of a different class, otherwise
#   whether all compared instance variables are equal
def ==(other)
  return false if other.class != self.class

  instance_variables.all? do |ivar|
    next true if ivar == :@results

    mine = instance_variable_get(ivar)
    theirs = other.instance_variable_get(ivar)
    # instance_variables yields Symbols, so the check must be against
    # :@roots; comparing with the String '@roots' never matches.
    next true if ivar == :@roots && mine.map(&:to_s) == theirs.map(&:to_s)

    mine == theirs
  end
end
#create_agent ⇒ Object
Create a Mechanize agent.
94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/recluse/profile.rb', line 94

# Create a Mechanize agent configured for this profile.
#
# The agent follows redirects only when the profile's redirect option is set,
# and identifies itself with the Recluse version, the Recluse URL and the
# profile's email.
#
# @return [Mechanize] the configured agent
def create_agent
  Mechanize.new do |a|
    a.ssl_version = 'TLSv1'
    # Certificate verification is switched off for every request.
    a.verify_mode = OpenSSL::SSL::VERIFY_NONE
    a.max_history = nil
    # NOTE(review): the attribute name was missing from the extracted listing
    # ("a. = true"); follow_meta_refresh is the presumed original - confirm
    # against lib/recluse/profile.rb.
    a.follow_meta_refresh = true
    a.keep_alive = false
    a.redirect_ok = @redirect
    a.user_agent = "Mozilla/5.0 (compatible; recluse/#{Recluse::VERSION}; +#{Recluse::URL}) #{@email}"
  end
end
#save ⇒ Object
Saves profile to ~/.recluse/NAME.yaml.
123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
# File 'lib/recluse/profile.rb', line 123

# Saves profile to ~/.recluse/NAME.yaml.
#
# Writes the name, roots (as strings), email and the optional settings into
# the "<name>.yaml" document of the '.recluse' UserConfig, then saves it.
# Tasks and results are not stored.
#
# @return [Object] whatever the document's +save+ returns
def save
  uconf = UserConfig.new '.recluse'
  options = uconf["#{@name}.yaml"]

  {
    'name' => @name,
    'roots' => @roots.map(&:to_s),
    'email' => @email,
    'blacklist' => @blacklist,
    'whitelist' => @whitelist,
    'internal_only' => @internal_only,
    'scheme_squash' => @scheme_squash,
    'redirect' => @redirect
  }.each { |key, value| options[key] = value }

  options.save
end
#test(key, options = {}) ⇒ Object
Runs test.
108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/recluse/profile.rb', line 108

# Runs test.
#
# Looks up the task class for +key+ via Recluse::Tasks.get, instantiates it
# with this profile and +options+ (plus the results tree under :results),
# runs it, and returns the results tree. An existing Recluse::HashTree stored
# for +key+ is reused; otherwise a new one is created.
#
# @param key [Object] identifier of the test to run
# @param options [Hash] extra options passed on to the task
# @return [Recluse::HashTree] the results tree for +key+
def test(key, options = {})
  unless @results.key?(key) && @results[key].class == Recluse::HashTree
    @results[key] = Recluse::HashTree.new do |url1, url2|
      # Order the pair so that +longer+ is never the shorter URL.
      longer, shorter = url2.length > url1.length ? [url2, url1] : [url1, url2]
      # Detect if URL exists already, but just has a slash at end
      longer == shorter ||
        (longer.length == (shorter.length + 1) &&
          longer[-1] == '/' &&
          shorter[-1] != '/' &&
          longer[0...-1] == shorter)
    end
  end

  @tasks[key] = Recluse::Tasks.get(key).new(self, options.merge(results: @results[key]))
  @tasks[key].run
  @results[key]
end