Class: Arachni::Parser

Inherits:
Object show all
Includes:
UI::Output, Utilities
Defined in:
lib/arachni/parser.rb,
lib/arachni/parser/sax.rb,
lib/arachni/parser/document.rb,
lib/arachni/parser/nodes/base.rb,
lib/arachni/parser/nodes/text.rb,
lib/arachni/parser/nodes/comment.rb,
lib/arachni/parser/nodes/element.rb,
lib/arachni/parser/with_children.rb,
lib/arachni/parser/extractors/base.rb,
lib/arachni/parser/nodes/with_value.rb,
lib/arachni/parser/with_children/search.rb,
lib/arachni/parser/nodes/element/with_attributes.rb,
lib/arachni/parser/nodes/element/with_attributes/attributes.rb

Overview

Analyzes HTML code, extracting input vectors and supporting information.

Author:

Defined Under Namespace

Modules: Extractors, Nodes, WithChildren

Classes: Document, SAX

Constant Summary collapse

# Sizes keyed by the names of the caching class methods (#parse, #parse_xml,
# #parse_fragment). Presumably the maximum number of cached entries per
# method -- confirm against the code that populates CACHE.
CACHE_SIZES = {
    parse:          50,
    parse_xml:      50,
    parse_fragment: 100
}

# Per-method caches, looked up as CACHE[__method__] by the parsing class
# methods. Starts out empty; it is expected to be filled in elsewhere.
CACHE = {}

# Element names of interest to the parser (presumably the tags kept when
# filtered parsing is requested -- confirm). Every name is listed once.
WHITELIST = %w(
    title base a form frame iframe meta input select option script link area
    textarea button comment !--
)
# Request header names that are removed from the request's headers before
# they are merged into the auditable header list (see #headers).
IGNORE_REQUEST_HEADERS = [HTTP::Client::SEED_HEADER_NAME, 'Content-Length']

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Utilities

#available_port, available_port_mutex, #bytes_to_kilobytes, #bytes_to_megabytes, #caller_name, #caller_path, #cookie_decode, #cookie_encode, #cookies_from_file, #cookies_from_parser, #cookies_from_response, #exception_jail, #exclude_path?, #follow_protocol?, #form_decode, #form_encode, #forms_from_parser, #forms_from_response, #full_and_absolute_url?, #generate_token, #get_path, #hms_to_seconds, #html_decode, #html_encode, #include_path?, #links_from_parser, #links_from_response, #normalize_url, #page_from_response, #page_from_url, #parse_set_cookie, #path_in_domain?, #path_too_deep?, #port_available?, #rand_port, #random_seed, #redundant_path?, #regexp_array_match, #remove_constants, #request_parse_body, #seconds_to_hms, #skip_page?, #skip_path?, #skip_resource?, #skip_response?, #uri_decode, #uri_encode, #uri_parse, #uri_parse_query, #uri_parser, #uri_rewrite

Methods included from UI::Output

#caller_location, #debug?, #debug_level, #debug_level_1?, #debug_level_2?, #debug_level_3?, #debug_level_4?, #debug_off, #debug_on, #disable_only_positives, #error_buffer, #error_log_fd, #error_logfile, #has_error_log?, #included, #log_error, #mute, #muted?, #only_positives, #only_positives?, #print_bad, #print_debug, #print_debug_backtrace, #print_debug_exception, #print_debug_level_1, #print_debug_level_2, #print_debug_level_3, #print_debug_level_4, #print_error, #print_error_backtrace, #print_exception, #print_info, #print_line, #print_ok, #print_status, #print_verbose, #reroute_to_file, #reroute_to_file?, reset_output_options, #set_error_logfile, #unmute, #verbose?, #verbose_off, #verbose_on

Constructor Details

#initialize(resource) ⇒ Parser

Returns a new instance of Parser.

Parameters:

  • resource (Document, HTTP::Response, Array<HTTP::Response>)

    Response(s) to analyze and parse. By providing multiple responses the parser will be able to perform some preliminary differential analysis and identify nonce tokens in inputs.


157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/arachni/parser.rb', line 157

# Sets up the parser from an already parsed document, a single response or
# a list of responses.
#
# @param resource [Document, HTTP::Response, Array<HTTP::Response>]
#   When an Array is given, its first element becomes the primary response
#   and the remaining non-nil elements are kept as secondary responses
#   (used by #forms to spot inputs whose values differ between responses).
#   The given array is NOT modified.
def initialize( resource )
    case resource

        when Document
            @resource = :document
            @document = resource

        when HTTP::Response
            @resource = :response

            @response = resource
            self.url  = @response.url

        when Array
            # Destructure instead of calling #shift, so that the caller's
            # array is left untouched.
            primary, *secondary = resource

            @secondary_responses = secondary.compact

            @resource = :response

            @response = primary
            self.url  = primary.url
    end
end

Instance Attribute Details

#response ⇒ HTTP::Response

Returns:


151
152
153
# File 'lib/arachni/parser.rb', line 151

# Reader for the response this parser was given (nil when none was set).
def response; @response; end

#url ⇒ String

Returns:


148
149
150
# File 'lib/arachni/parser.rb', line 148

# Reader for the URL associated with this parser (nil when none was set).
def url; @url; end

Class Method Details

.markup?(string) ⇒ Boolean

Returns:

  • (Boolean)

108
109
110
111
112
113
114
# File 'lib/arachni/parser.rb', line 108

# Checks whether Ox can parse the given string into an element.
#
# @param string [String] data to check.
# @return [Boolean]
#   `true` if `Ox.parse` yields an `Ox::Element`; `false` if it yields
#   anything else or raises a `StandardError`.
def markup?( string )
    Ox.parse( string ).is_a?( Ox::Element )
rescue
    false
end

.parse(html, options = {}) ⇒ Object


65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/arachni/parser.rb', line 65

# Runs the Ox SAX HTML parser over the given markup and returns the
# resulting document. Results are cached in `CACHE[:parse]`, keyed by the
# `[html, options]` pair.
#
# @param html [String] markup to parse.
# @param options [Hash] forwarded to `prepare_ox_options`.
# @return [Object] whatever the handler exposes as `#document`.
def parse( html, options = {} )
    cache_key = [html, options]

    CACHE[__method__].fetch( cache_key ) do
        handler, sax_options = prepare_ox_options( options )
        input = StringIO.new( html )

        begin
            Ox.sax_html( handler, input, sax_options )
        rescue SAX::Stop
            # Deliberately ignored; presumably the handler's way of
            # ending the parse early -- confirm against SAX.
        end

        handler.document
    end
end

.parse_fragment(html) ⇒ Object


93
94
95
96
97
98
99
100
# File 'lib/arachni/parser.rb', line 93

# Parses the given markup and returns its first top-level node, detached
# from its parent and document. Results are cached in
# `CACHE[:parse_fragment]`, keyed by the markup.
#
# @param html [String] markup fragment to parse.
# @return [Object, nil]
#   The first child of the parsed document, or `nil` when the parsed
#   document has no children (e.g. empty input).
def parse_fragment( html )
    CACHE[__method__].fetch( html ) do
        node = parse( html ).children.first

        # Nothing was parsed, so there is no node to detach.
        next if !node

        node.parent   = nil
        node.document = nil
        node
    end
end

.parse_xml(xml) ⇒ Object


102
103
104
105
106
# File 'lib/arachni/parser.rb', line 102

# Parses the given XML with Nokogiri, caching the result in
# `CACHE[:parse_xml]` keyed by the XML string.
#
# @param xml [String] XML to parse.
# @return [Object] the value returned by `Nokogiri::XML`.
def parse_xml( xml )
    CACHE[__method__].fetch( xml ) { Nokogiri::XML( xml ) }
end

.push_parse(options = {}) ⇒ Object


78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/arachni/parser.rb', line 78

# Starts an HTML parse that is fed through a pipe: the caller writes markup
# to the returned IO while a job posted to `push_parse_pool` reads from the
# other end and runs the Ox SAX parser over it.
#
# NOTE(review): the read end of the pipe is not closed in this method --
# confirm it is released elsewhere.
#
# @param options [Hash] forwarded to `prepare_ox_options`.
# @return [Array]
#   `[write_io, handler]`: the write end of the pipe and the handler
#   object returned by `prepare_ox_options`.
def push_parse( options = {} )
    reader, writer = IO.pipe

    handler, sax_options = prepare_ox_options( options )

    push_parse_pool.post do
        begin
            Ox.sax_html( handler, reader, sax_options )
        rescue SAX::Stop
            # Deliberately ignored; presumably signals an early stop.
        end
    end

    [writer, handler]
end

Instance Method Details

#base ⇒ String

Returns Base href, if there is one.

Returns:

  • (String)

    Base href, if there is one.


430
431
432
# File 'lib/arachni/parser.rb', line 430

# Returns the `href` of the first `base` node of the document, falling
# back to the parser's URL. The result is memoized.
#
# When there is no document (#document returns nil) the parser's URL is
# used instead of raising.
#
# @return [String, nil] base href, or `@url` when none is available.
def base
    @base ||= begin
        doc  = document
        href = doc && doc.nodes_by_name( :base ).map { |b| b['href'] }.first

        href || @url
    end
end

#body ⇒ Object


231
232
233
# File 'lib/arachni/parser.rb', line 231

# Returns the body to parse: the override set via #body= if there is one,
# otherwise the response's body when the parser was built from a response.
#
# @return [String, nil]
def body
    return @body if @body
    return if !from_response?

    @response.body
end

#body=(string) ⇒ String

Returns Override the #response body for the parsing process.

Returns:


226
227
228
229
# File 'lib/arachni/parser.rb', line 226

# Overrides the body used for parsing and drops every memoized value that
# was derived from the previous body, so that it gets recomputed.
#
# @param string [String] new body.
# @return [String]
def body=( string )
    @links = @forms = @cookies = @document = nil

    # These are computed from the document/cookies as well and would
    # otherwise keep reflecting the old body.
    @paths = @base = @link_templates = nil
    @cookie_jar = @cookies_to_be_audited = nil

    @body = string
end

Returns Cookies with which to update the HTTP cookie-jar.

Returns:


405
406
407
408
409
410
411
412
413
414
415
416
# File 'lib/arachni/parser.rb', line 405

# Combines the cookies of this page with the cookies the HTTP client's
# cookie-jar holds for the page URL. When both have a cookie with the same
# name, the page's cookie is kept. The result is memoized and returned
# frozen from the second call on.
#
# @return [Array] page cookies followed by the remaining jar cookies.
def cookie_jar
    return @cookie_jar.freeze if @cookie_jar

    # Names already provided by the page.
    page_cookie_names = Set.new( cookies.map( &:name ) )

    jar_only = HTTP::Client.cookie_jar.for_url( @url ).reject do |cookie|
        page_cookie_names.include?( cookie.name )
    end

    @cookie_jar = (cookies | jar_only)
end

#cookies ⇒ Array<Element::Cookie>

Returns Cookies from HTTP headers and response body.

Returns:


368
369
370
371
372
373
374
375
# File 'lib/arachni/parser.rb', line 368

# Collects cookies from the response headers and, when the response is
# text-based and `Cookie.in_html?` accepts the body, from the parsed
# document too. The result is memoized and returned frozen from the second
# call on.
#
# @return [Array] cookies from the headers, plus any found via the parser.
def cookies
    return @cookies.freeze if @cookies

    @cookies = Cookie.from_headers( @url, @response.headers )

    if text? && Cookie.in_html?( body )
        @cookies |= Cookie.from_parser( self )
    end

    @cookies
end

#cookies_to_be_audited ⇒ Array<Element::Cookie>

Returns Cookies to be audited.

Returns:


379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
# File 'lib/arachni/parser.rb', line 379

# Builds the list of cookies to audit for this page: the page's own
# cookies plus every cookie in the HTTP client's cookie-jar whose name the
# page did not set. Each returned cookie is a copy with its action set to
# the page URL. The result is memoized and returned frozen from the second
# call on.
#
# @return [Array] copies of the cookies; empty when the response is not
#   text-based.
def cookies_to_be_audited
    return @cookies_to_be_audited.freeze if @cookies_to_be_audited
    return [] if !text?

    # Names of the cookies set by the current page.
    page_cookie_names = Set.new( cookies.map( &:name ) )

    # Take everything from the cookie-jar, giving preference to the
    # cookies specified by the current page, if there are any.
    jar_cookies = HTTP::Client.cookie_jar.cookies.reject do |cookie|
        page_cookie_names.include?( cookie.name )
    end

    # These cookies are to be audited, so the list includes even cookies
    # that are completely irrelevant to the current page, i.e. all cookies
    # observed since the beginning of the scan.
    @cookies_to_be_audited = (cookies | jar_cookies).map do |cookie|
        cookie.dup.tap { |copy| copy.action = @url }
    end
end

#document ⇒ Arachni::Parser::Document?

Returns a parsed HTML document from the body of the HTTP response or nil if the response data wasn't text-based or the response couldn't be parsed.

Returns:

  • (Arachni::Parser::Document, nil)

    Returns a parsed HTML document from the body of the HTTP response or nil if the response data wasn't text-based or the response couldn't be parsed.


239
240
241
242
243
244
# File 'lib/arachni/parser.rb', line 239

# Returns the parsed document, parsing #body (with `filter: true`) on first
# use and memoizing the result.
#
# @return [Object, nil] the document, or `nil` when the data isn't
#   text-based.
def document
    return @document if @document

    text? ? (@document = self.class.parse( body, filter: true )) : nil
end

#forms ⇒ Array<Element::Form>

Returns Forms from #document.

Returns:


270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# File 'lib/arachni/parser.rb', line 270

# Extracts the forms of the page. When secondary responses are available,
# the forms of each one are compared against the page's forms: for forms
# with the same coverage ID and name/ID, a hidden input whose value
# differs between the two is recorded as the form's nonce.
#
# NOTE(review): the result is only memoized when secondary responses were
# given; without them every call extracts the forms again.
#
# @return [Array] forms; empty when the response isn't text-based or the
#   body is rejected by `Form.in_html?`.
def forms
    return @forms.freeze if @forms
    return [] if !text? || (body && !Form.in_html?( body ))

    page_forms = Form.from_parser( self )
    return page_forms if !@secondary_responses

    @secondary_responses.each do |secondary|
        next if secondary.body.to_s.empty?

        Form.from_parser( Parser.new( secondary ) ).each do |other|
            page_forms.each do |form|
                # Only compare forms that identify as the same form.
                same_form = "#{form.coverage_id}:#{form.name_or_id}" ==
                    "#{other.coverage_id}:#{other.name_or_id}"
                next if !same_form

                form.inputs.each do |name, value|
                    next if value == other.inputs[name]
                    next if form.field_type_for( name ) != :hidden

                    # Hidden input with a changing value: treat as nonce.
                    form.nonce_name = name
                end
            end
        end
    end

    @forms = page_forms
end

#from_document? ⇒ Boolean

Returns:

  • (Boolean)

220
221
222
# File 'lib/arachni/parser.rb', line 220

# @return [Boolean] `true` if the parser was built from a document.
def from_document?
    :document == @resource
end

#from_response? ⇒ Boolean

Returns:

  • (Boolean)

216
217
218
# File 'lib/arachni/parser.rb', line 216

# @return [Boolean] `true` if the parser was built from a response.
def from_response?
    :response == @resource
end

#headers ⇒ Hash

Note:

It will include common request headers as well as headers from the HTTP request.

Returns List of valid auditable HTTP header fields.

Returns:

  • (Hash)

    List of valid auditable HTTP header fields.


251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
# File 'lib/arachni/parser.rb', line 251

# Builds the auditable header elements for the page: a set of common
# request headers overlaid with the headers of the originating request
# (minus the ones listed in IGNORE_REQUEST_HEADERS). The result is
# memoized.
#
# NOTE(review): the value is a frozen Array holding one `Header` per field,
# not a Hash.
#
# @return [Array] one `Header` per header field.
def headers
    return @headers if @headers

    common = {
        'Accept'          => 'text/html,application/xhtml+xml,application' +
            '/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset'  => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
        'From'            => Options.authorized_by  || '',
        'User-Agent'      => Options.http.user_agent || '',
        'Referer'         => @url,
        'Pragma'          => 'no-cache'
    }

    # Work on a copy so the request's own headers stay intact.
    from_request = response.request.headers.dup
    IGNORE_REQUEST_HEADERS.each { |name| from_request.delete( name ) }

    @headers = common.merge( from_request ).map do |name, value|
        Header.new( url: @url, inputs: { name => value } )
    end.freeze
end

#jsons ⇒ Array<Element::JSON>

Returns:


339
340
341
# File 'lib/arachni/parser.rb', line 339

# Returns the JSON element built from the originating request, wrapped in
# an array. The result is memoized.
#
# @return [Array] one element, or empty when `JSON.from_request` returns
#   `nil`.
def jsons
    return @jsons if @jsons

    element = JSON.from_request( @url, response.request )
    @jsons  = element.nil? ? [] : [element]
end

Returns Link to the page.

Returns:


300
301
302
303
# File 'lib/arachni/parser.rb', line 300

# Builds the link element for the page itself from the page URL and its
# query parameters.
#
# @return [Object, nil]
#   A `Link`, or `nil` when the URL has no parameters and there is a
#   response which is not a redirection.
def link
    vars = link_vars

    if vars.empty? && @response && !@response.redirection?
        return
    end

    Link.new( url: @url, inputs: vars )
end

Returns LinkTemplate for the current page.

Returns:


307
308
309
310
311
312
313
314
315
316
317
# File 'lib/arachni/parser.rb', line 307

# Builds the link-template element for the page URL, if
# `LinkTemplate.extract_inputs` finds a template for it. Freezes `@url`
# in the process.
#
# @return [Object, nil] a `LinkTemplate`, or `nil` when no template matched.
def link_template
    template, inputs = LinkTemplate.extract_inputs( @url )
    return if !template

    frozen_url = @url.freeze

    LinkTemplate.new(
        url:      frozen_url,
        action:   frozen_url,
        inputs:   inputs,
        template: template
    )
end

Returns Links matching OptionsGroups::Audit#link_templates in #document.

Returns:


330
331
332
333
334
335
336
# File 'lib/arachni/parser.rb', line 330

# Returns the page's own link template (if any) together with the link
# templates extracted via the parser when the response is text-based. The
# result is memoized and returned frozen from the second call on.
#
# @return [Array]
def link_templates
    return @link_templates.freeze if @link_templates

    textual = text?
    own     = [link_template].compact

    @link_templates =
        textual ? (own | LinkTemplate.from_parser( self )) : own
end

Returns Parameters found in #url.

Returns:


350
351
352
353
354
# File 'lib/arachni/parser.rb', line 350

# Returns the query parameters of the page URL (after `rewrite`), frozen
# and memoized.
#
# @return [Hash] parameters, or a new empty Hash when the URL can't be
#   parsed.
def link_vars
    parsed_url = uri_parse( @url )

    if parsed_url
        @link_vars ||= parsed_url.rewrite.query_parameters.freeze
    else
        {}
    end
end

Returns Links in #document.

Returns:


321
322
323
324
325
326
# File 'lib/arachni/parser.rb', line 321

# Returns the link to the page itself (if any) together with the links
# extracted via the parser. Extraction is skipped when the response isn't
# text-based or when there is a body that `Link.in_html?` rejects. The
# result is memoized and returned frozen from the second call on.
#
# @return [Array]
def links
    return @links.freeze if @links

    extract = text? && (!body || Link.in_html?( body ))
    own     = [link].compact

    @links = extract ? (own | Link.from_parser( self )) : own
end

#page ⇒ Page

Returns:


206
207
208
# File 'lib/arachni/parser.rb', line 206

# Returns a `Page` built from this parser, creating it on first use.
#
# @return [Object] the memoized `Page`.
def page
    return @page if @page

    @page = Page.new( parser: self )
end

#paths ⇒ Array<String>

Returns Distinct links to follow.

Returns:


420
421
422
423
424
425
426
# File 'lib/arachni/parser.rb', line 420

# Returns the paths produced by `run_extractors`, frozen and memoized.
#
# @return [Array] extracted paths; empty when there is no document.
def paths
  return @paths if @paths

  # Assigned before anything else runs, so a value is already in place
  # while the document is loaded and the extractors run.
  @paths = []

  @paths = document ? run_extractors.freeze : @paths.freeze
end

#text? ⇒ Boolean

Returns true if the given HTTP response data are text based, false otherwise.

Returns:

  • (Boolean)

    true if the given HTTP response data are text based, false otherwise.


212
213
214
# File 'lib/arachni/parser.rb', line 212

# Tells whether the data to parse is text-based. Anything that didn't come
# from a response is treated as text; otherwise the response decides.
#
# @return [Boolean]
def text?
    !from_response? || @response.text?
end

#to_absolute(relative_url) ⇒ String

Converts a relative URL to an absolute one.

Parameters:

  • relative_url (String)

    URL to convert to absolute.

Returns:


195
196
197
198
199
200
201
202
203
# File 'lib/arachni/parser.rb', line 195

# Converts a relative URL to an absolute one, resolving it against #base
# and falling back to the parser's URL when #base returns nothing. The
# conversion itself is delegated to the inherited implementation.
#
# @param relative_url [String] URL to convert to absolute.
# @return [String]
def to_absolute( relative_url )
    reference = base || @url

    super( relative_url, reference )
end

#ui_forms ⇒ Object

Dummy method, only the browser can fill this in.


362
363
364
# File 'lib/arachni/parser.rb', line 362

# Placeholder: the parser has no UI forms to report.
#
# @return [Array] a new, empty array.
def ui_forms
    Array.new
end

#ui_inputs ⇒ Object

Dummy method, only the browser can fill this in.


357
358
359
# File 'lib/arachni/parser.rb', line 357

# Placeholder: the parser has no UI inputs to report.
#
# @return [Array] a new, empty array.
def ui_inputs
    Array.new
end

#xmls ⇒ Array<Element::XML>

Returns:


344
345
346
# File 'lib/arachni/parser.rb', line 344

# Returns the XML element built from the originating request, wrapped in
# an array. The result is memoized.
#
# @return [Array] one element, or empty when `XML.from_request` returns
#   `nil`.
def xmls
    return @xmls if @xmls

    element = XML.from_request( @url, response.request )
    @xmls   = element.nil? ? [] : [element]
end