Class: Arachni::RPC::Server::Spider

Inherits:
Spider show all
Defined in:
lib/arachni/rpc/server/spider.rb

Overview

Extends the regular Spider with high-performance distributed capabilities.

Author:

Constant Summary collapse

BUFFER_SIZE =

Amount of URLs to buffer before distributing.

1000
FILLUP_ATTEMPTS =

How many times to try and fill the buffer before distributing what’s in it.

200

Constants inherited from Spider

Spider::MAX_TRIES

Instance Attribute Summary

Attributes inherited from Spider

#failures, #opts, #redirects

Instance Method Summary collapse

Methods inherited from Spider

#done?, #fancy_sitemap, #idle?, #on_complete, #on_each_page, #on_each_response, #paths, #pause, #paused?, #push, #resume, #running?, #url

Methods included from Utilities

#available_port, #cookie_encode, #cookies_from_document, #cookies_from_file, #cookies_from_response, #exception_jail, #exclude_path?, #extract_domain, #follow_protocol?, #form_decode, #form_encode, #form_parse_request_body, #forms_from_document, #forms_from_response, #generate_token, #get_path, #html_decode, #html_encode, #include_path?, #links_from_document, #links_from_response, #normalize_url, #page_from_response, #page_from_url, #parse_query, #parse_set_cookie, #parse_url_vars, #path_in_domain?, #path_too_deep?, #port_available?, #rand_port, #redundant_path?, #remove_constants, #seed, #skip_page?, #skip_path?, #skip_resource?, #to_absolute, #uri_decode, #uri_encode, #uri_parse, #uri_parser, #url_sanitize

Methods included from UI::Output

#debug?, #debug_off, #debug_on, #disable_only_positives, #error_logfile, #flush_buffer, #log_error, #mute, #muted?, old_reset_output_options, #only_positives, #only_positives?, #print_bad, #print_debug, #print_debug_backtrace, #print_debug_pp, #print_error, #print_error_backtrace, #print_info, #print_line, #print_ok, #print_status, #print_verbose, #reroute_to_file, #reroute_to_file?, reset_output_options, #set_buffer_cap, #set_error_logfile, #uncap_buffer, #unmute, #verbose, #verbose?

Constructor Details

#initialize(framework) ⇒ Spider

Returns a new instance of Spider.



39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/arachni/rpc/server/spider.rb', line 39

# Sets up the distributed spider's bookkeeping on top of the regular Spider.
#
# @param framework [Object]
#   Owner of this spider; its `#opts` are handed to the parent constructor.
#   (Presumably the RPC server Framework -- confirm against the caller.)
def initialize( framework )
    super( framework.opts )

    @framework = framework

    # Callbacks registered through #on_first_run and #after_each_run.
    @on_first_run_blocks   = []
    @after_each_run_blocks = []

    @distribution_filter = Support::LookUp::Moolb.new

    # No peers are known yet. Looking up a URL which has not been stored
    # in @done_signals yields `true`, because of the Hash default.
    @peers        = {}
    @done_signals = Hash.new( true )
end

Instance Method Details

#after_each_run(&block) ⇒ Object

Parameters:

  • block (Block)

    Block to be called after each URL batch has been consumed.



58
59
60
# File 'lib/arachni/rpc/server/spider.rb', line 58

# Registers a callback to be run after each URL batch has been consumed.
#
# @param block [Block]
#   Block to be called after each URL batch has been consumed.
#
# @return [Array] The list of registered blocks, including the new one.
def after_each_run( &block )
    @after_each_run_blocks.push( block )
end

#clear_distribution_filter ⇒ Object



52
53
54
# File 'lib/arachni/rpc/server/spider.rb', line 52

# Empties the distribution filter by calling `#clear` on it.
#
# @return [Object] Whatever the filter's `#clear` returns.
def clear_distribution_filter
    @distribution_filter.clear
end

#local_sitemap ⇒ Hash<String, Integer>

Returns URLs crawled by this Instance, along with their HTTP status codes.

Returns:

  • (Hash<String, Integer>)

    URLs crawled by this Instance, along with their HTTP status codes.



137
138
139
# File 'lib/arachni/rpc/server/spider.rb', line 137

# @return [Hash<String, Integer>]
#   URLs crawled by this Instance, along with their HTTP status codes.
#   This is the raw `@sitemap`; unlike {#sitemap} it never consults
#   `@distributed_sitemap`.
def local_sitemap
    @sitemap
end

#on_first_run(&block) ⇒ Object

Parameters:

  • block (Block)

    Block to be called just before the crawl starts.



64
65
66
# File 'lib/arachni/rpc/server/spider.rb', line 64

# Registers a callback to be run just before the crawl starts.
#
# @param block [Block]
#   Block to be called just before the crawl starts.
#
# @return [Array] The list of registered blocks, including the new one.
def on_first_run( &block )
    @on_first_run_blocks.push( block )
end

#peer_done(url) ⇒ Object

Sets a peer crawler’s state to finished. Exposed so that peers can signal the master once they’re done.

Parameters:

  • url (String)

    URL of the finished peer.



152
153
154
155
156
# File 'lib/arachni/rpc/server/spider.rb', line 152

# Sets a peer crawler's state to finished and then runs the master's
# done handler.
#
# Exposed so that peers can signal the master once they're done.
#
# @param url [String] URL of the finished peer.
#
# @return [true]
def peer_done( url )
    @done_signals.store( url, true )
    master_done_handler
    true
end

#run(*args) ⇒ Object

See Also:

  • Arachni::Spider#run


69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/arachni/rpc/server/spider.rb', line 69

# Runs the crawl through the parent Spider's #run, wrapped with the
# distributed bookkeeping.
#
# @param args Passed through, untouched, to the parent's #run.
#
# @return [Object] Result of {#sitemap}.
#
# @see Arachni::Spider#run
def run( *args )
    # Only invoked until it yields a truthy value, which is then memoized.
    @first_run_blocks ||= call_on_first_run

    # When not running solo, set the on-complete blocks aside so that
    # the parent's #run finds none registered.
    unless solo?
        saved_blocks = @on_complete_blocks.dup
        @on_complete_blocks.clear
    end

    super( *args )

    flush_url_distribution_buffer
    master_done_handler

    call_after_each_run if slave?

    # Put the blocks that were set aside back in place.
    @on_complete_blocks = saved_blocks.dup unless solo?

    sitemap
end

#signal_if_done(master) ⇒ Object

Signals the `master` Instance that this crawler has finished.

Parameters:



163
164
165
166
# File 'lib/arachni/rpc/server/spider.rb', line 163

# Signals the `master` Instance that this crawler has finished; does
# nothing (and returns `nil`) while the crawler is not done.
#
# @param master [Object]
#   Client for the master Instance; must respond to `#spider`, whose
#   result must respond to `#peer_done`.
#
# @return [Object, nil]
#   Whatever `peer_done` returns, or `nil` when nothing was signalled.
def signal_if_done( master )
    if done?
        # The empty block is passed on purpose -- presumably to make the
        # RPC call asynchronous; confirm against the RPC client.
        master.spider.peer_done( framework.multi_self_url ) {}
    end
end

#sitemap ⇒ Array<String>

Returns Crawled URLs.

Returns:



142
143
144
# File 'lib/arachni/rpc/server/spider.rb', line 142

# @return [Array<String>]
#   Crawled URLs: `@distributed_sitemap` when it has been set to a truthy
#   value, otherwise whatever the parent's #sitemap returns.
def sitemap
    return @distributed_sitemap if @distributed_sitemap
    super
end

#update_peers(peers, &block) ⇒ Object

Updates the list of Instances to assist in the crawl.

Parameters:

  • peers (Array<Hash>)

    Array containing Instance info hashes – with `:url` and `:token` at least.

  • block (Block)

    Block to be called once the update operation has completed.



103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/arachni/rpc/server/spider.rb', line 103

# Updates the list of Instances to assist in the crawl.
#
# Connects to every given peer, adds this Instance's own framework under
# its own URL and stores the resulting table sorted by URL. When this
# Instance is the master, every peer's spider is additionally asked to
# update its own peer list.
#
# @param peers [Array<Hash>]
#   Array containing Instance info hashes -- with `:url` and `:token` at
#   least.
# @param block [Block]
#   Block to be called once the update operation has completed.
#
# @return [true]
def update_peers( peers, &block )
    @peers_array = peers

    connections = @peers_array.each_with_object( {} ) do |info, clients|
        clients[info[:url]] = framework.connect_to_instance( info )
    end

    # This Instance is part of the peer table as well.
    connections[framework.multi_self_url] = framework

    # Sort once, after the table is complete, so that keys and values
    # come out in URL order.
    @peers = Hash[connections.sort]

    @peer_urls    = @peers.keys
    @peer_clients = @peers.values

    # Non-masters are done as soon as their own table has been refreshed.
    unless master?
        block.call if block_given?
        return true
    end

    # Each peer gets the given peers plus this Instance's own info.
    propagate = proc do |peer, iter|
        peer.spider.update_peers( @peers_array | [self_instance_info] ) {
            iter.return
        }
    end

    map_peers( propagate, proc { block.call if block_given? } )

    true
end