Class: Recluse::Profile

Inherits:
Object
  • Object
show all
Defined in:
lib/recluse/profile.rb

Overview

A profile is an atomic unit of rules for link checking.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(name, roots, email, blacklist: [], whitelist: [], internal_only: false, scheme_squash: false, redirect: false) ⇒ Profile

Create a profile.

Raises:

  • (ProfileError) — if no roots are given



63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/recluse/profile.rb', line 63

# Create a profile.
#
# @param name identifier of the profile; #save interpolates it into the
#   "NAME.yaml" file name
# @param roots [Array] starting points; entries that are not already Link
#   objects are wrapped with Link.new(entry, :root)
# @param email contact string appended to the user agent by #create_agent
# @param blacklist [Array] stored as-is; defaults to an empty array
# @param whitelist [Array] stored as-is; defaults to an empty array
# @param internal_only [Boolean] stored as-is; defaults to false
# @param scheme_squash [Boolean] stored as-is; defaults to false
# @param redirect [Boolean] handed to the agent's redirect_ok setting by
#   #create_agent; defaults to false
# @raise [ProfileError] if roots is nil or empty
def initialize(
  name,
  roots,
  email,
  blacklist: [],
  whitelist: [],
  internal_only: false,
  scheme_squash: false,
  redirect: false
)
  # A nil roots value gets the same error as an empty list instead of
  # surfacing as a NoMethodError from nil.empty?.
  raise ProfileError, 'Profile needs roots for starting point' if roots.nil? || roots.empty?

  @name = name
  @email = email
  # is_a? (rather than an exact class comparison) keeps Link subclasses
  # untouched instead of re-wrapping them in another Link.
  @roots = roots.map { |root| root.is_a?(Link) ? root : Link.new(root, :root) }
  @blacklist = blacklist
  @whitelist = whitelist
  @internal_only = internal_only
  @scheme_squash = scheme_squash
  @redirect = redirect
  # Both hashes start empty and are filled by #test, keyed by test key.
  @tasks = {}
  @results = {}
end

Instance Attribute Details

#blacklist ⇒ Object

Array of URL patterns to exclude from checking. Optional. Defaults to empty array.



35
36
37
# File 'lib/recluse/profile.rb', line 35

# Reader for the blacklist setting.
#
# @return [Array] the value held in @blacklist; #initialize sets it from the
#   +blacklist:+ keyword, which defaults to an empty array
def blacklist
  @blacklist
end

#email ⇒ Object

Used in the user-agent to identify who is running the crawler. This is so that if there’s a problem with your spidering, you will be contacted and not the author of Recluse. Required.



31
32
33
# File 'lib/recluse/profile.rb', line 31

# Reader for the contact e-mail.
#
# @return the value held in @email; #create_agent appends it to the
#   user-agent string
def email
  @email
end

#internal_only ⇒ Object

Don’t check external URLs. Optional. Defaults to false.



43
44
45
# File 'lib/recluse/profile.rb', line 43

# Reader for the internal-only flag.
#
# @return the value held in @internal_only; #initialize sets it from the
#   +internal_only:+ keyword, which defaults to false
def internal_only
  @internal_only
end

#name ⇒ Object

Identifier of the profile. Make sure that it is filename friendly. Required.



23
24
25
# File 'lib/recluse/profile.rb', line 23

# Reader for the profile identifier.
#
# @return the value held in @name; #save interpolates it into the
#   "NAME.yaml" file name, hence the advice to keep it filename friendly
def name
  @name
end

#redirect ⇒ Object

When enabled, will follow redirects and report only the status code for the page that is landed upon. When disabled, will report the redirect status code. Defaults to false.



51
52
53
# File 'lib/recluse/profile.rb', line 51

# Reader for the redirect flag.
#
# @return the value held in @redirect; #create_agent assigns it to the
#   agent's redirect_ok setting
def redirect
  @redirect
end

#results ⇒ Object

Hash of resulting HashTrees.



59
60
61
# File 'lib/recluse/profile.rb', line 59

# Reader for the collected results.
#
# @return [Hash] the value held in @results; starts as an empty hash and
#   #test stores one Recluse::HashTree per test key in it
def results
  @results
end

#roots ⇒ Object

Array of URLs to start spidering. Required.



27
28
29
# File 'lib/recluse/profile.rb', line 27

# Reader for the starting points.
#
# @return [Array] the value held in @roots; #initialize builds it by keeping
#   Link entries and wrapping every other entry with Link.new(entry, :root)
def roots
  @roots
end

#scheme_squash ⇒ Object

HTTP and HTTPS schemed URLs are treated as equal. Optional. Defaults to false.



47
48
49
# File 'lib/recluse/profile.rb', line 47

# Reader for the scheme-squash flag.
#
# @return the value held in @scheme_squash; #initialize sets it from the
#   +scheme_squash:+ keyword, which defaults to false
def scheme_squash
  @scheme_squash
end

#tasks ⇒ Object

Hash of the tasks that have been run, keyed by test key.



55
56
57
# File 'lib/recluse/profile.rb', line 55

# Reader for the tasks that have been run.
#
# @return [Hash] the value held in @tasks; starts as an empty hash and #test
#   stores the task instance it creates under the test key
def tasks
  @tasks
end

#whitelist ⇒ Object

Array of exceptions to the blacklist. Optional. Defaults to empty array.



39
40
41
# File 'lib/recluse/profile.rb', line 39

# Reader for the whitelist setting.
#
# @return [Array] the value held in @whitelist; #initialize sets it from the
#   +whitelist:+ keyword, which defaults to an empty array
def whitelist
  @whitelist
end

Class Method Details

.load(profile) ⇒ Object

Loads profile by name.

Raises:

  • (ProfileError) — if no saved profile with the given name exists



151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/recluse/profile.rb', line 151

# Loads a profile by name from the '.recluse' user configuration.
#
# Reads "PROFILE.yaml", forwards the optional settings that are present and
# non-nil as keyword arguments, and falls back to an empty roots list and an
# empty e-mail string when those entries are missing or nil.
#
# @param profile name of the profile; also passed on as the new profile's name
# @return [Profile] the profile built from the stored settings
# @raise [ProfileError] if "PROFILE.yaml" does not exist in the configuration
def self.load(profile)
  config = UserConfig.new '.recluse'
  filename = "#{profile}.yaml"
  raise ProfileError, "Profile '#{profile}' doesn't exist" unless config.exist?(filename)

  stored = config[filename]
  # An entry only counts when the key exists and its value is not nil.
  present = ->(key) { stored.key?(key) && !stored[key].nil? }

  optional = %i[blacklist whitelist internal_only scheme_squash redirect]
  keyword_args = optional.each_with_object({}) do |option, collected|
    text_key = option.to_s
    collected[option] = stored[text_key] if present.call(text_key)
  end

  Profile.new(
    profile,
    present.call('roots') ? stored['roots'] : [],
    present.call('email') ? stored['email'] : '',
    **keyword_args
  )
end

Instance Method Details

#==(other) ⇒ Object

Test if profiles share the same configuration options.



140
141
142
143
144
145
146
147
# File 'lib/recluse/profile.rb', line 140

# Test if profiles share the same configuration options.
#
# Every instance variable of the receiver is compared with the same variable
# on +other+, with two exceptions: @results is ignored, and @roots is
# compared by the string form of its entries.
#
# NOTE(review): @tasks is still compared, so two profiles that have run
# different tasks compare unequal — confirm that this is intended.
#
# @param other the object to compare with
# @return [Boolean] false when +other+ is of a different class or any
#   compared variable differs, true otherwise
def ==(other)
  return false if other.class != self.class

  instance_variables.all? do |ivar|
    mine = instance_variable_get(ivar)
    theirs = other.instance_variable_get(ivar)
    # instance_variables yields Symbols, so the names must be matched as
    # Symbols; comparing against the String '@roots' never matches.
    case ivar
    when :@results then true
    when :@roots then Array(mine).map(&:to_s) == Array(theirs).map(&:to_s)
    else mine == theirs
    end
  end
end

#create_agent ⇒ Object

Create a Mechanize agent.



94
95
96
97
98
99
100
101
102
103
104
# File 'lib/recluse/profile.rb', line 94

# Create a Mechanize agent configured for this profile.
#
# The user-agent string carries the Recluse version and URL followed by the
# profile's e-mail, and redirect handling follows the profile's redirect flag.
#
# @return the object returned by Mechanize.new
def create_agent
  identity = "Mozilla/5.0 (compatible; recluse/#{Recluse::VERSION}; +#{Recluse::URL}) #{@email}"

  Mechanize.new do |agent|
    # NOTE(review): this pins the connection to the 'TLSv1' setting; servers
    # that no longer accept it would presumably fail — confirm it is wanted.
    agent.ssl_version = 'TLSv1'
    # NOTE(review): VERIFY_NONE turns certificate verification off.
    agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
    agent.max_history = nil
    agent.follow_meta_refresh = true
    agent.keep_alive = false
    agent.redirect_ok = @redirect
    agent.user_agent = identity
  end
end

#save ⇒ Object

Saves profile to ~/.recluse/NAME.yaml.



123
124
125
126
127
128
129
130
131
132
133
134
135
136
# File 'lib/recluse/profile.rb', line 123

# Saves the profile's settings to "NAME.yaml" in the '.recluse' user
# configuration.
#
# Roots are written in their string form; every other setting is written
# as stored. @tasks and @results are not written.
#
# @return whatever the configuration file's #save returns
def save
  config_file = UserConfig.new('.recluse')["#{@name}.yaml"]

  {
    'name' => @name,
    'roots' => @roots.map(&:to_s),
    'email' => @email,
    'blacklist' => @blacklist,
    'whitelist' => @whitelist,
    'internal_only' => @internal_only,
    'scheme_squash' => @scheme_squash,
    'redirect' => @redirect
  }.each { |setting, value| config_file[setting] = value }

  config_file.save
end

#test(key, options = {}) ⇒ Object

Runs test.



108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/recluse/profile.rb', line 108

# Runs the task registered under +key+ and returns its result tree.
#
# A Recluse::HashTree is created for +key+ unless one is already stored in
# @results. The task class comes from Recluse::Tasks.get(key); it is built
# with this profile and +options+ merged with the tree under :results,
# stored in @tasks, and run.
#
# @param key identifier of the task to run
# @param options [Hash] extra options handed to the task
# @return the entry stored in @results for +key+
def test(key, options = {})
  reusable = @results.key?(key) && @results[key].class == Recluse::HashTree
  unless reusable
    @results[key] = Recluse::HashTree.new do |first, second|
      longer, shorter = second.length > first.length ? [second, first] : [first, second]
      next true if longer == shorter

      # Also treat the two as the same URL when the longer one is the
      # shorter one plus a single trailing slash.
      longer.length == shorter.length + 1 &&
        longer[-1] == '/' &&
        shorter[-1] != '/' &&
        longer[0...-1] == shorter
    end
  end

  task = Recluse::Tasks.get(key).new(self, options.merge(results: @results[key]))
  @tasks[key] = task
  task.run
  @results[key]
end