Class: Arachni::Page

Inherits:

Object

Object
Arachni::Page

show all

Includes:: Utilities

Defined in:: lib/arachni/page.rb,
lib/arachni/page/dom.rb,
lib/arachni/page/scope.rb,
lib/arachni/page/dom/transition.rb

Overview

It holds page data like elements, cookies, headers, etc…

Author:

Tasos “Zapotek” Laskos <[email protected]>

Defined Under Namespace

Classes: DOM, Error, Scope

Constant Summary collapse

# Names of the element collections exposed by a page; each entry is also the
# name of the reader that returns that collection (see #elements).
ELEMENTS = [
    :links,
    :forms,
    :cookies,
    :headers,
    :link_templates
]

Instance Attribute Summary collapse

#cache ⇒ Hash readonly
#dom ⇒ DOM

DOM snapshot.
#element_audit_whitelist ⇒ Set<Integer> readonly

Audit whitelist based on Element::Capabilities::Auditable#coverage_hash.
#metadata ⇒ Hash readonly

Holds page data that will need to persist between #clear_cache calls and other utility data.
#response ⇒ HTTP::Response readonly

HTTP response.

Class Method Summary collapse

Instance Method Summary collapse

#==(other) ⇒ Object
#_dump(_) ⇒ Object
#audit_element?(element) ⇒ Bool

`true` if the element should be audited, `false` otherwise.
#body ⇒ String

HTTP response body.
#body=(string) ⇒ Object
#clear_cache ⇒ Page

`self` with caches cleared.
#code ⇒ Integer

HTTP response status code.
#cookie_jar ⇒ Array<Element::Cookie>

Cookies extracted from the supplied cookie-jar.
#do_not_audit_elements ⇒ Object

It forces #audit_element? to always return false.
#document ⇒ Nokogiri::HTML

Parsed HTML document.
#dup ⇒ Object
#elements ⇒ Array

All page elements.
#eql?(other) ⇒ Boolean
#has_script? ⇒ Boolean

`true` if the page contains client-side code, `false` otherwise.
#hash ⇒ Object
#initialize(options) ⇒ Page constructor

Needs either a `:parser` or a `:response` or user provided data.
#method(*args) ⇒ String

The request method that returned the page.
#parsed_url ⇒ Arachni::URI
#parser ⇒ Parser
#paths ⇒ Array<String>

Paths contained in this page.
#performer ⇒ Object

Object which performed the #request which led to this page.
#persistent_hash ⇒ Object
#platforms ⇒ Platform

Applicable platforms for the page.
#prepare_for_report ⇒ Object
#query_vars ⇒ Hash

URL query parameters.
#request ⇒ HTTP::Request

HTTP request.
#scope ⇒ Scope
#text? ⇒ Boolean

`true` if the body of the page is text-based, `false` otherwise.
#title ⇒ String

Title of the page.
#to_h ⇒ Hash (also: #to_hash)

Converts the page data to a hash.
#to_initialization_options ⇒ Object
#to_rpc_data ⇒ Hash

Data representing this instance that are suitable for RPC transmission.
#to_s ⇒ Object
#update_element_audit_whitelist(list) ⇒ Set

#element_audit_whitelist.
#url ⇒ String

URL of the page.

Methods included from Utilities

#available_port, #caller_name, #caller_path, #cookie_decode, #cookie_encode, #cookies_from_document, #cookies_from_file, #cookies_from_response, #exception_jail, #exclude_path?, #follow_protocol?, #form_decode, #form_encode, #forms_from_document, #forms_from_response, #generate_token, #get_path, #hms_to_seconds, #html_decode, #html_encode, #include_path?, #links_from_document, #links_from_response, #normalize_url, #page_from_response, #page_from_url, #parse_set_cookie, #path_in_domain?, #path_too_deep?, #port_available?, #rand_port, #random_seed, #redundant_path?, #remove_constants, #request_parse_body, #seconds_to_hms, #skip_page?, #skip_path?, #skip_resource?, #skip_response?, #to_absolute, #uri_decode, #uri_encode, #uri_parse, #uri_parse_query, #uri_parser, #uri_rewrite

Constructor Details

#initialize(options) ⇒ `Page`

Needs either a `:parser` or a `:response` or user provided data.

Parameters:

options (Hash) —

Hash from which to set instance attributes.

Options Hash (options):

:response (Array<HTTP::Response>, HTTP::Response) —

HTTP response of the page – or array of responses for the page for content refinement.
:parser (Parser) —

An instantiated Arachni::Parser.

# File 'lib/arachni/page.rb', line 145

# Builds a page from a `:parser`, a `:response` or user provided data.
#
# @param    [Hash]  options
#   Hash from which to set instance attributes. A shallow copy is taken
#   before any key is removed, so the caller's hash keeps its keys.
# @option options [Parser] :parser
#   Stored in the cache; its `response` becomes the page response.
# @option options [Object] :do_not_audit_elements
#   Stored as-is in `@do_not_audit_elements`.
#
# @raise    [ArgumentError]
#   If `options` is empty or if no URL can be determined for the page.
def initialize( options )
    fail ArgumentError, 'Options cannot be empty.' if options.empty?
    options = options.dup

    @cache = {}

    @do_not_audit_elements = options.delete(:do_not_audit_elements)

    @cache[:parser] = options.delete(:parser)
    @response = @cache[:parser].response if @cache[:parser]

    # We need to know whether or not the page has been dynamically updated
    # with elements, in order to optimize #dup and #hash operations.
    @has_custom_elements = Set.new

    @metadata ||= {}

    # Every remaining option is applied through its own `<name>=` writer,
    # receiving the value after it has been passed through #try_dup.
    options.each do |k, v|
        send( "#{k}=", try_dup( v ) )
    end

    # Rebuilt here regardless of what the loop above assigned: the DOM is
    # created from the `:dom` option (or an empty Hash) plus a reference to
    # this page.
    @dom = DOM.new( (options[:dom] || {}).merge( page: self ) )

    fail ArgumentError, 'No URL given!' if !url

    Platform::Manager.fingerprint( self )

    # Whatever whitelist was supplied (or none at all) ends up as a Set.
    @element_audit_whitelist ||= []
    @element_audit_whitelist   = Set.new( @element_audit_whitelist )
end

Instance Attribute Details

#cache ⇒ `Hash` (readonly)

Returns:

(Hash)



121
122
123

# File 'lib/arachni/page.rb', line 121

# @return   [Hash]
#   The page's cache store (`@cache`), returned as-is and not copied.
def cache
  @cache
end

#dom ⇒ `DOM`

Returns DOM snapshot.

Returns:

(DOM) —

DOM snapshot.



112
113
114

# File 'lib/arachni/page.rb', line 112

# @return   [DOM]
#   DOM snapshot (`@dom`).
def dom
  @dom
end

#element_audit_whitelist ⇒ `Set<Integer>` (readonly)

Returns Audit whitelist based on Element::Capabilities::Auditable#coverage_hash.

Returns:

(Set<Integer>) —

Audit whitelist based on Element::Capabilities::Auditable#coverage_hash.

See Also:



134
135
136

# File 'lib/arachni/page.rb', line 134

# @return   [Set<Integer>]
#   Audit whitelist based on Element::Capabilities::Auditable#coverage_hash;
#   the Set itself is returned, not a copy.
def element_audit_whitelist
  @element_audit_whitelist
end

#metadata ⇒ `Hash` (readonly)

Returns Holds page data that will need to persist between #clear_cache calls and other utility data.

Returns:

(Hash) —

Holds page data that will need to persist between #clear_cache calls and other utility data.



126
127
128

# File 'lib/arachni/page.rb', line 126

# @return   [Hash]
#   Holds page data that will need to persist between #clear_cache calls
#   and other utility data.
def metadata
  @metadata
end

#response ⇒ `HTTP::Response` (readonly)

Returns HTTP response.

Returns:

(HTTP::Response) —

HTTP response.



116
117
118

# File 'lib/arachni/page.rb', line 116

# @return   [HTTP::Response]
#   HTTP response (`@response`); `nil` when none has been assigned.
def response
  @response
end

Class Method Details

._load(data) ⇒ `Object`



546
547
548

# File 'lib/arachni/page.rb', line 546

# Marshal hook, counterpart of #_dump.
#
# @param    [String]    data
#   Marshalled initialization options.
#
# @return   [Page]
#   Page built from the unmarshalled options.
def self._load( data )
    options = Marshal.load( data )
    new( options )
end

.from_data(data) ⇒ `Object`

Parameters:

data (Hash) —

a customizable set of options

# File 'lib/arachni/page.rb', line 82

# Builds a page out of user provided data, filling in defaults for
# everything that has not been specified.
#
# The given Hash and the `:response` / `:request` Hashes nested in it are
# copied before defaults get applied, so the caller's objects are left
# untouched.
#
# @param    [Hash]  data
#   Page data. `:url` and `:body` are only consumed when the `:response`
#   Hash does not already carry them.
#
# @return   [Page]
def self.from_data( data )
    data = data.dup

    # A shallow #dup of `data` still shares the nested Hashes with the
    # caller, work on copies of those too. The copy is stored right away so
    # that the `:response` key keeps its position among the options.
    response = data[:response] = (data[:response] || {}).dup
    response[:code] ||= 200
    response[:url]  ||= data.delete( :url )
    response[:body] ||= data.delete( :body ) || ''

    request = (response[:request] || {}).dup
    request[:url] ||= response[:url]

    data[:links]   ||= []
    data[:forms]   ||= []
    data[:cookies] ||= []
    data[:headers] ||= []

    data[:cookie_jar] ||= []

    response[:request] = Arachni::HTTP::Request.new( request )
    data[:response]    = Arachni::HTTP::Response.new( response )

    new data
end

.from_response(response) ⇒ `Page`

Parameters:

response (HTTP::Response) —

HTTP response to parse.

Returns:

(Page)



59
60
61

# File 'lib/arachni/page.rb', line 59

# @param    [HTTP::Response] response
#   HTTP response to parse.
#
# @return   [Page]
#   The page produced by a Parser created for the response.
def self.from_response( response )
    parser = Parser.new( response )
    parser.page
end

.from_rpc_data(data) ⇒ `Page`

Parameters:

data (Hash) —

#to_rpc_data

Returns:

(Page)

# File 'lib/arachni/page.rb', line 504

# Restores a page from its RPC representation.
#
# NOTE: The 'dom' entry is removed from the given Hash.
#
# @param    [Hash]  data
#   {#to_rpc_data}
#
# @return   [Page]
def self.from_rpc_data( data )
    dom_data = data.delete('dom')

    options = data.each_with_object({}) do |(name, value), normalized|
        normalized[name.to_sym] =
            case name
                when 'response'
                    HTTP::Response.from_rpc_data( value )

                # Only the nonces of known element types are let through.
                when 'metadata'
                    %w(link form cookie header).each_with_object({}) do |type, sanitized|
                        next if !value[type] || !value[type]['nonces']

                        sanitized[type.to_sym] = { nonces: value[type]['nonces'] }
                    end

                # 'links' => Element::Link, 'forms' => Element::Form, etc.
                when 'links', 'forms', 'cookies'
                    value.map do |element_data|
                        Element.const_get( name[0...-1].capitalize.to_sym ).
                            from_rpc_data( element_data )
                    end.to_a

                else
                    value
            end
    end

    page = new( options )

    # The DOM needs a reference to its page, so it can only be restored
    # once the page itself exists.
    page.instance_variable_set(
        '@dom', DOM.from_rpc_data( dom_data.merge( page: page ) )
    )
    page
end

.from_url(url, opts = {}, &block) ⇒ `Page`

Parameters:

url (String) —

URL to fetch.
opts (Hash) (defaults to: {})
block (Block) —

Block to which to pass the page object. If given, the request will be performed asynchronously. If no block is given, the page will be fetched synchronously and be returned by this method.

Options Hash (opts):

:precision (Integer) — default: 2 —

How many times to request the page and examine changes between requests. Used to identify nonce tokens etc.
:http (Hash) —

HTTP request options.

Returns:

(Page)

# File 'lib/arachni/page.rb', line 37

# Fetches a URL and builds a page out of the response(s).
#
# @param    [String]    url
#   URL to fetch.
# @param    [Hash]  opts
#   Left untouched, defaults are not written back into it.
# @option opts  [Integer]   :precision  (2)
#   How many times to request the page and examine changes between
#   requests. Used to identify nonce tokens etc.
# @option opts  [Hash]  :http
#   HTTP request options.
# @param    [Block] block
#   Block to which to pass the page object. If given, the request will be
#   performed asynchronously. If no block is given, the page will be fetched
#   synchronously and be returned by this method.
#
# @return   [Page]
#   Only in the synchronous (block-less) case.
def self.from_url( url, opts = {}, &block )
    responses = []

    # Keep the default in a local, the options Hash belongs to the caller
    # (and may well be frozen).
    precision = opts[:precision] || 2

    precision.times do
        HTTP::Client.get( url, opts[:http] || {} ) do |res|
            responses << res

            # Wait until every requested response has arrived.
            next if responses.size != precision
            block.call( from_response( responses ) ) if block_given?
        end
    end

    if !block_given?
        HTTP::Client.run
        from_response( responses )
    end
end

Instance Method Details

#==(other) ⇒ `Object`



440
441
442

# File 'lib/arachni/page.rb', line 440

# Pages are considered equal when their #hash values match.
#
# @param    [Object]    other
#
# @return   [Boolean]
def ==( other )
    own_hash   = hash
    other_hash = other.hash

    own_hash == other_hash
end

#_dump(_) ⇒ `Object`



542
543
544

# File 'lib/arachni/page.rb', line 542

def _dump( _ )
    Marshal.dump( to_initialization_options )
end

#audit_element?(element) ⇒ `Bool`

Returns `true` if the element should be audited, `false` otherwise.

Parameters:

element (Element::Capabilities::Auditable, Integer) —

Element or Element::Capabilities::Auditable#coverage_hash.

Returns:

(Bool) —

`true` if the element should be audited, `false` otherwise.

See Also:

# File 'lib/arachni/page.rb', line 227

# @param    [Element::Capabilities::Auditable, Integer]    element
#   Element or Element::Capabilities::Auditable#coverage_hash.
#
# @return   [Bool]
#   `true` if the element should be audited, `false` otherwise. Always
#   `false` once auditing has been disabled for the page, always `true`
#   while the whitelist is empty.
def audit_element?( element )
    # Explicit `false` (instead of a bare `return`) so that the result is
    # always a boolean.
    return false if @do_not_audit_elements
    return true  if @element_audit_whitelist.empty?

    coverage_hash = element.is_a?( Integer ) ? element : element.coverage_hash
    @element_audit_whitelist.include?( coverage_hash )
end

#body ⇒ `String`

Returns HTTP response body.

Returns:

(String) —

HTTP response body.

# File 'lib/arachni/page.rb', line 267

# @return   [String]
#   The explicitly assigned body if there is one, otherwise the HTTP
#   response body (remembered after the first call). An empty String when
#   there is neither a body nor a response.
def body
    return @body if @body
    return '' unless @response

    @body = response.body
end

#body=(string) ⇒ `Object`

Parameters:

string (String) —

Page body.

# File 'lib/arachni/page.rb', line 274

# Assigns a new body; the JavaScript detection result and the caches are
# reset first since they were derived from the previous body.
#
# @param    [String]    string
#   Page body, stored as a frozen copy of its String representation.
def body=( string )
    @has_javascript = nil
    clear_cache

    copy  = string.to_s.dup
    @body = copy.freeze
end

#clear_cache ⇒ `Page`

Note:

Will preserve caches for elements which have been externally modified.

Returns `self` with caches cleared.

Returns:

(Page) —

`self` with caches cleared.

# File 'lib/arachni/page.rb', line 339

# @note Will preserve caches for elements which have been externally modified.
#
# @return   [Page]
#   `self` with caches cleared.
def clear_cache
    disposable = ELEMENTS.reject { |type| @has_custom_elements.include?( type ) }

    # Remove the association to this page before clearing the elements
    # from cache to make it easier on the GC.
    disposable.each do |type|
        (@cache[type] || []).each { |element| element.page = nil }
    end

    @cache.keep_if { |key, _| @has_custom_elements.include?( key ) }
    self
end

#code ⇒ `Integer`

Returns HTTP response status code (`0` when there is no response).

Returns:

(Integer) —

HTTP response status code (`0` when there is no response).

# File 'lib/arachni/page.rb', line 254

# @return   [Integer]
#   HTTP response status code (remembered after the first lookup), `0`
#   when there is neither a stored code nor a response.
def code
    return @code if @code
    return 0 unless response

    @code = response.code
end

#cookie_jar ⇒ `Array<Element::Cookie>`

Returns Cookies extracted from the supplied cookie-jar.

Returns:

(Array<Element::Cookie>) —

Cookies extracted from the supplied cookie-jar.



298
299
300

# File 'lib/arachni/page.rb', line 298

# @return   [Array<Element::Cookie>]
#   Cookies extracted from the supplied cookie-jar: taken from the parser
#   on first access, or an empty Array when there is no parser.
def cookie_jar
    return @cookie_jar if @cookie_jar

    @cookie_jar = parser ? parser.cookie_jar : []
end

#do_not_audit_elements ⇒ `Object`

It forces #audit_element? to always return false.



236
237
238

# File 'lib/arachni/page.rb', line 236

# Raises the flag that #audit_element? checks before anything else, so that
# no element of this page gets audited from now on.
#
# @return   [true]
def do_not_audit_elements
    @do_not_audit_elements = true
end

#document ⇒ `Nokogiri::HTML`

Returns Parsed HTML document.

Returns:

(Nokogiri::HTML) —

Parsed HTML document.



331
332
333

# File 'lib/arachni/page.rb', line 331

# @return   [Nokogiri::HTML]
#   Parsed HTML document (cached): the parser's document when a parser is
#   available, otherwise the #body parsed directly.
def document
    return @cache[:document] if @cache[:document]

    @cache[:document] =
        if parser.nil?
            Nokogiri::HTML( body )
        else
            parser.document
        end
end

#dup ⇒ `Object`



448
449
450

# File 'lib/arachni/page.rb', line 448

# @return   [Page]
#   New instance of the same class, built from #to_initialization_options.
def dup
    options = to_initialization_options
    self.class.new( options )
end

#elements ⇒ `Array`

Returns All page elements.

Returns:

(Array) —

All page elements.



318
319
320

# File 'lib/arachni/page.rb', line 318

# @return   [Array]
#   All page elements: the collections named in ELEMENTS, flattened into
#   one list.
def elements
    collections = ELEMENTS.each_with_object( [] ) do |type, all|
        all << send( type )
    end

    collections.flatten
end

#eql?(other) ⇒ `Boolean`

Returns:

(Boolean)



444
445
446

# File 'lib/arachni/page.rb', line 444

# Same comparison as #==.
#
# @param    [Object]    other
#
# @return   [Boolean]
def eql?( other )
    self == other
end

#has_script? ⇒ `Boolean`

Returns `true` if the page contains client-side code, `false` otherwise.

Returns:

(Boolean) —

`true` if the page contains client-side code, `false` otherwise.

# File 'lib/arachni/page.rb', line 371

def has_script?
    return @has_javascript if !@has_javascript.nil?

    if !response.headers.content_type.to_s.start_with?( 'text/html' ) ||
        !text? || !document
        return @has_javascript = false
    end

    # First check, quick and simple.
    return @has_javascript = true if document.css( 'script' ).any?

    # Check for event attributes, if there are any then there's JS to be
    # executed.
    Browser::Javascript.events.flatten.each do |event|
        return @has_javascript = true if document.xpath( "//*[@#{event}]" ).any?
    end

    # If there's 'javascript:' in 'href' and 'action' attributes then
    # there's JS to be executed.
    [:action, :href].each do |candidate|
        document.xpath( "//*[@#{candidate}]" ).each do |attribute|
            if attribute.attributes[candidate.to_s].to_s.start_with?( 'javascript:' )
                return @has_javascript = true
            end
        end
    end

    @has_javascript = false
end

#hash ⇒ `Object`



436
437
438

# File 'lib/arachni/page.rb', line 436

# @return   [Object]
#   The #hash of the page's digest.
def hash
    page_digest = digest
    page_digest.hash
end

#method(*args) ⇒ `String`

Returns The request method that returned the page.

Returns:

(String) —

The request method that returned the page

# File 'lib/arachni/page.rb', line 324

# Doubles as Object#method: when called with arguments it behaves like the
# stock implementation.
#
# @return   [String]
#   The request method that returned the page.
def method( *args )
    return response.request.method unless args.any?

    super( *args )
end

#parsed_url ⇒ `Arachni::URI`

Returns:

(Arachni::URI)



188
189
190

# File 'lib/arachni/page.rb', line 188

# @return   [Arachni::URI]
#   The page #url wrapped by Arachni::URI.
def parsed_url
    address = url
    Arachni::URI( address )
end

#parser ⇒ `Parser`

Returns:

(Parser)

# File 'lib/arachni/page.rb', line 193

# @return   [Parser, nil]
#   Cached parser for the page response, created on demand; `nil` when the
#   page has no response.
def parser
    return unless @response

    cached = @cache[:parser]
    return cached if cached

    fresh = @cache[:parser] = Parser.new( @response )

    # The page may have a browser-assigned body, set it as the one to parse.
    fresh.body = body
    fresh
end

#paths ⇒ `Array<String>`

Returns Paths contained in this page.

Returns:

(Array<String>) —

Paths contained in this page.

See Also:

Arachni::Parser#paths



306
307
308

# File 'lib/arachni/page.rb', line 306

# @return   [Array<String>]
#   Paths contained in this page (cached); an empty Array when there is no
#   parser.
#
# @see Arachni::Parser#paths
def paths
    return @cache[:paths] if @cache[:paths]

    @cache[:paths] = parser ? parser.paths : []
end

#performer ⇒ `Object`

Returns Object which performed the #request which led to this page.

Returns:

(Object) —

Object which performed the #request which led to this page.



183
184
185

# File 'lib/arachni/page.rb', line 183

# @return   [Object]
#   Object which performed the #request which led to this page.
def performer
    originating_request = request
    originating_request.performer
end

#persistent_hash ⇒ `Object`



432
433
434

# File 'lib/arachni/page.rb', line 432

# @return   [Object]
#   The #persistent_hash of the page's digest.
def persistent_hash
    page_digest = digest
    page_digest.persistent_hash
end

#platforms ⇒ `Platform`

Returns Applicable platforms for the page.

Returns:

(Platform) —

Applicable platforms for the page.



312
313
314

# File 'lib/arachni/page.rb', line 312

# @return   [Platform]
#   Applicable platforms for the page, as registered with
#   Platform::Manager under the page #url.
def platforms
    key = url
    Platform::Manager[key]
end

#prepare_for_report ⇒ `Object`

# File 'lib/arachni/page.rb', line 351

# Strips the page of everything that need not be stored with a report:
# caches, binary bodies, the cookie-jar and DOM digest/skip-states.
#
# @return   [Page]
#   `self`
def prepare_for_report
    # We want a hard clear, that's why we don't call #clear_cache.
    @cache.clear

    # If we're dealing with binary data remove it before storing.
    unless text?
        response.body = nil
        self.body     = nil
    end

    @cookie_jar && @cookie_jar.clear

    @dom.digest      = nil
    @dom.skip_states = nil

    self
end

#query_vars ⇒ `Hash`

Returns URL query parameters.

Returns:

(Hash) —

URL query parameters.



261
262
263

# File 'lib/arachni/page.rb', line 261

# @return   [Hash]
#   URL query parameters, parsed from the page #url on first access and
#   cached afterwards.
def query_vars
    return @cache[:query_vars] if @cache[:query_vars]

    @cache[:query_vars] = uri_parse_query( url )
end

#request ⇒ `HTTP::Request`

Returns HTTP request.

Returns:

(HTTP::Request) —

HTTP request.



242
243
244

# File 'lib/arachni/page.rb', line 242

# @return   [HTTP::Request]
#   HTTP request, as referenced by the page #response.
def request
    page_response = response
    page_response.request
end

#scope ⇒ `Scope`

Returns:

(Scope)



177
178
179

# File 'lib/arachni/page.rb', line 177

# @return   [Scope]
#   Scope for this page, created on first access and reused afterwards
#   instead of being re-instantiated on every call.
def scope
    @scope ||= Scope.new( self )
end

#text? ⇒ `Boolean`

Returns `true` if the body of the page is text-based, `false` otherwise.

Returns:

(Boolean) —

`true` if the body of the page is text-based, `false` otherwise.

# File 'lib/arachni/page.rb', line 403

# @return   [Boolean]
#   `true` if the body of the page is text-based, `false` otherwise
#   (including when there is no response to ask).
def text?
    response ? response.text? : false
end

#title ⇒ `String`

Returns Title of the page.

Returns:

(String) —

Title of the page.



410
411
412

# File 'lib/arachni/page.rb', line 410

# @return   [String, nil]
#   Title of the page; `nil` when it cannot be determined (no `title`
#   element, or any other error while looking it up).
def title
    document.css( 'title' ).first.text
rescue StandardError
    nil
end

#to_h ⇒ `Hash` Also known as: to_hash

Returns Converts the page data to a hash.

Returns:

(Hash) —

Converts the page data to a hash.

# File 'lib/arachni/page.rb', line 416

# Converts the page data to a hash: one entry per instance variable
# (internal bookkeeping ones excluded), topped up with the cached data,
# minus the parser.
#
# @return   [Hash]
def to_h
    skipped = [:@document, :@do_not_audit_elements, :@has_custom_elements, :@scope]

    data = (instance_variables - skipped).each_with_object({}) do |ivar, h|
        h[ivar.to_s.gsub( '@', '' ).to_sym] = try_dup( instance_variable_get( ivar ) )
    end

    data = data.merge( @cache )
    data.delete( :parser )
    data
end

#to_initialization_options ⇒ `Object`

# File 'lib/arachni/page.rb', line 452

# @return   [Hash]
#   Options from which an equivalent page can be instantiated.
def to_initialization_options
    # Plain attributes, only the ones that have a value.
    h = [:body, :cookie_jar, :element_audit_whitelist, :metadata].each_with_object({}) do |name, options|
        value = try_dup( instance_variable_get( "@#{name}".to_sym ) )
        options[name] = value if value
    end

    # Only elements which have been customized need to be carried over,
    # as copies that no longer point back to this page.
    ELEMENTS.each do |type|
        next unless @has_custom_elements.include?( type )

        custom = @cache[type]
        next if !custom || custom.empty?

        h[type] = custom.map do |element|
            copy = element.dup
            copy.page = nil
            copy
        end
    end

    h[:response]              = response
    h[:do_not_audit_elements] = @do_not_audit_elements

    h[:dom] = dom.to_h.keys.each_with_object({}) do |key, dom_options|
        dom_options[key] = try_dup( dom.send( key ) )
    end

    h
end

#to_rpc_data ⇒ `Hash`

Returns Data representing this instance that are suitable for RPC transmission.

Returns:

(Hash) —

Data representing this instance that are suitable for RPC transmission.

# File 'lib/arachni/page.rb', line 484

# @return   [Hash]
#   Data representing this instance that are suitable for RPC transmission:
#   the initialization options with String keys, with the DOM, response
#   and elements replaced by their own RPC data and without the cookie-jar.
def to_rpc_data
    data = to_initialization_options.my_stringify_keys(false)

    data.merge!(
        'dom'                     => dom.to_rpc_data,
        'element_audit_whitelist' => element_audit_whitelist.to_a,
        'response'                => data['response'].to_rpc_data
    )

    %w(links forms cookies).each do |type|
        data[type] = send( type ).map( &:to_rpc_data ) if data[type]
    end

    data.delete( 'cookie_jar' )
    data
end

#to_s ⇒ `Object`



428
429
430

# File 'lib/arachni/page.rb', line 428

# @return   [String]
#   Short description of the page: class, object ID, the stored URL
#   (inspected) and the DOM's String form.
def to_s
    "#<#{self.class}:#{object_id} @url=#{@url.inspect} @dom=#{@dom}>"
end

#update_element_audit_whitelist(list) ⇒ `Set`

Returns #element_audit_whitelist.

Parameters:

list (Array<Element::Capabilities::Auditable, Integer>) —

Audit whitelist based on elements or Element::Capabilities::Auditable#coverage_hash values.

Returns:

(Set) —

#element_audit_whitelist

See Also:

# File 'lib/arachni/page.rb', line 212

# @param    [Array<Element::Capabilities::Auditable, Integer>]    list
#   Audit whitelist based on elements or
#   Element::Capabilities::Auditable#coverage_hash values; a single entry
#   or nested Arrays are accepted too.
#
# @return   [Set]
#   {#element_audit_whitelist}
def update_element_audit_whitelist( list )
    [list].flatten.each do |e|
        @element_audit_whitelist <<
            (e.is_a?( Integer ) ? e : e.coverage_hash )
    end

    # Hand back the whitelist itself, as documented, rather than the
    # flattened input list.
    @element_audit_whitelist
end

#url ⇒ `String`

Returns URL of the page.

Returns:

(String) —

URL of the page.



248
249
250

# File 'lib/arachni/page.rb', line 248

# @return   [String, nil]
#   URL of the page: the stored one if any, otherwise the URL of the
#   response. `nil` when there is neither, so that callers (like the
#   constructor's 'No URL given!' check) get to handle the missing URL
#   instead of running into a NoMethodError.
def url
    @url ||= (@response.url if @response)
end

Class: Arachni::Page

Overview

Defined Under Namespace

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Utilities

Constructor Details

#initialize(options) ⇒ Page

Instance Attribute Details

#cache ⇒ Hash (readonly)

#dom ⇒ DOM

#element_audit_whitelist ⇒ Set<Integer> (readonly)

#metadata ⇒ Hash (readonly)

#response ⇒ HTTP::Response (readonly)

Class Method Details

._load(data) ⇒ Object

.from_data(data) ⇒ Object

.from_response(response) ⇒ Page

.from_rpc_data(data) ⇒ Page

.from_url(url, opts = {}, &block) ⇒ Page

Instance Method Details

#==(other) ⇒ Object

#_dump(_) ⇒ Object

#audit_element?(element) ⇒ Bool

#body ⇒ String

#body=(string) ⇒ Object

#clear_cache ⇒ Page

#code ⇒ Integer

#cookie_jar ⇒ Array<Element::Cookie>

#do_not_audit_elements ⇒ Object

#document ⇒ Nokogiri::HTML

#dup ⇒ Object

#elements ⇒ Array

#eql?(other) ⇒ Boolean

#has_script? ⇒ Boolean

#hash ⇒ Object

#method(*args) ⇒ String

#parsed_url ⇒ Arachni::URI

#parser ⇒ Parser

#paths ⇒ Array<String>

#performer ⇒ Object

#persistent_hash ⇒ Object

#platforms ⇒ Platform

#prepare_for_report ⇒ Object

#query_vars ⇒ Hash

#request ⇒ HTTP::Request

#scope ⇒ Scope

#text? ⇒ Boolean

#title ⇒ String

#to_h ⇒ Hash Also known as: to_hash

#to_initialization_options ⇒ Object

#to_rpc_data ⇒ Hash

#to_s ⇒ Object

#update_element_audit_whitelist(list) ⇒ Set

#url ⇒ String

#initialize(options) ⇒ `Page`

#cache ⇒ `Hash` (readonly)

#dom ⇒ `DOM`

#element_audit_whitelist ⇒ `Set<Integer>` (readonly)

#metadata ⇒ `Hash` (readonly)

#response ⇒ `HTTP::Response` (readonly)

._load(data) ⇒ `Object`

.from_data(data) ⇒ `Object`

.from_response(response) ⇒ `Page`

.from_rpc_data(data) ⇒ `Page`

.from_url(url, opts = {}, &block) ⇒ `Page`

#==(other) ⇒ `Object`

#_dump(_) ⇒ `Object`

#audit_element?(element) ⇒ `Bool`

#body ⇒ `String`

#body=(string) ⇒ `Object`

#clear_cache ⇒ `Page`

#code ⇒ `Integer`

#cookie_jar ⇒ `Array<Element::Cookie>`

#do_not_audit_elements ⇒ `Object`

#document ⇒ `Nokogiri::HTML`

#dup ⇒ `Object`

#elements ⇒ `Array`

#eql?(other) ⇒ `Boolean`

#has_script? ⇒ `Boolean`

#hash ⇒ `Object`

#method(*args) ⇒ `String`

#parsed_url ⇒ `Arachni::URI`

#parser ⇒ `Parser`

#paths ⇒ `Array<String>`

#performer ⇒ `Object`

#persistent_hash ⇒ `Object`

#platforms ⇒ `Platform`

#prepare_for_report ⇒ `Object`

#query_vars ⇒ `Hash`

#request ⇒ `HTTP::Request`

#scope ⇒ `Scope`

#text? ⇒ `Boolean`

#title ⇒ `String`

#to_h ⇒ `Hash` Also known as: to_hash

#to_initialization_options ⇒ `Object`

#to_rpc_data ⇒ `Hash`

#to_s ⇒ `Object`

#update_element_audit_whitelist(list) ⇒ `Set`

#url ⇒ `String`