Class: Arachni::Parser

Inherits:
Object show all
Includes:
UI::Output, Utilities
Defined in:
lib/arachni/parser.rb

Overview

HTML Parser

Analyzes HTML code, extracting forms, links and cookies depending on the user's options.

ignored.

Author:

Defined Under Namespace

Modules: Extractors

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Utilities

#available_port, #cookie_encode, #cookies_from_document, #cookies_from_file, #cookies_from_response, #exception_jail, #exclude_path?, #extract_domain, #follow_protocol?, #form_decode, #form_encode, #form_parse_request_body, #forms_from_document, #forms_from_response, #generate_token, #get_path, #html_decode, #html_encode, #include_path?, #links_from_document, #links_from_response, #normalize_url, #page_from_response, #page_from_url, #parse_query, #parse_set_cookie, #parse_url_vars, #path_in_domain?, #path_too_deep?, #port_available?, #rand_port, #redundant_path?, #remove_constants, #seed, #skip_page?, #skip_path?, #skip_resource?, #uri_decode, #uri_encode, #uri_parse, #uri_parser, #url_sanitize

Methods included from UI::Output

#debug?, #debug_off, #debug_on, #disable_only_positives, #error_logfile, #flush_buffer, #log_error, #mute, #muted?, old_reset_output_options, #only_positives, #only_positives?, #print_bad, #print_debug, #print_debug_backtrace, #print_debug_pp, #print_error, #print_error_backtrace, #print_info, #print_line, #print_ok, #print_status, #print_verbose, #reroute_to_file, #reroute_to_file?, reset_output_options, #set_buffer_cap, #set_error_logfile, #uncap_buffer, #unmute, #verbose, #verbose?

Constructor Details

#initialize(res, opts = Options) ⇒ Parser

Returns a new instance of Parser.

Parameters:

  • res (Typhoeus::Responses, Array<Typhoeus::Responses>)

    Response(s) to analyze and parse into an Arachni::Page. By providing multiple responses, the parser will be able to perform some preliminary differential analysis and identify nonce tokens in inputs.

  • opts (Options) (defaults to: Options)


78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/arachni/parser.rb', line 78

# Sets up the parser state from one or more HTTP responses.
#
# @param [Typhoeus::Responses, Array<Typhoeus::Responses>] res
#   Response(s) to analyze. When an Array is given, its first element becomes
#   the primary response and the remaining non-`nil` elements are stored as
#   secondary responses (used by #forms to spot nonce inputs).
#   The given Array itself is never modified.
# @param [Options] opts
def initialize( res, opts = Options )
    @opts = opts

    if res.is_a? Array
        # `res[1..-1]` is `nil` for an empty Array, hence the guard below.
        @secondary_responses = res[1..-1]
        @secondary_responses.compact! if @secondary_responses

        # Read the primary response without removing it from the caller's
        # Array (`shift` would mutate the argument).
        res = res.first
    end

    @code     = res.code
    self.url  = res.effective_url
    @html     = res.body
    @response = res

    @response_headers = res.headers_hash

    # Both are populated lazily, see #doc and #paths.
    @doc   = nil
    @paths = nil
end

Instance Attribute Details

#optsOptions (readonly)

Returns Options instance.

Returns:



68
69
70
# File 'lib/arachni/parser.rb', line 68

# Read-only accessor for the options object kept in `@opts`.
#
# @return [Options] Options instance.
def opts
  @opts
end

#urlString

Returns The url of the page.

Returns:

  • (String)

    The url of the page.



65
66
67
# File 'lib/arachni/parser.rb', line 65

# Accessor for the page URL kept in `@url`.
#
# @return [String] The url of the page.
def url
  @url
end

Instance Method Details

#baseString

Returns the `base href`, if there is one.

Returns:

  • (String)

    The `base href`, if there is one.



319
320
321
# File 'lib/arachni/parser.rb', line 319

# Looks up the `href` of the first `<base href>` element of the document.
#
# The lookup is best-effort: any error raised along the way (for example
# when there is no document or no matching element) results in `nil`.
# A found value is memoized in `@base`.
#
# @return [String, nil] `base href`, if there is one.
def base
    @base ||= begin
        doc.search( '//base[@href]' ).first['href']
    rescue StandardError
        nil
    end
end

#cookiesArray<Element::Cookie>

Returns Cookies from HTTP headers and response body.

Returns:



304
305
306
307
# File 'lib/arachni/parser.rb', line 304

# Collects the cookies of the page: the ones found in the document come
# first, followed by those from the response headers not already present.
#
# @return [Array<Element::Cookie>] Cookies from HTTP headers and response body.
def cookies
    from_body    = Cookie.from_document( @url, doc )
    from_headers = Cookie.from_headers( @url, @response_headers )

    from_body | from_headers
end

#docNokogiri::HTML?

Returns a parsed HTML document from the body of the HTTP response, or `nil` if the response data wasn't text-based or the response couldn't be parsed.

Returns:

  • (Nokogiri::HTML, nil)

    Returns a parsed HTML document from the body of the HTTP response, or `nil` if the response data wasn't text-based or the response couldn't be parsed.



225
226
227
228
# File 'lib/arachni/parser.rb', line 225

# Lazily parses the response body with Nokogiri and memoizes the result.
#
# Parsing is only attempted for text-based responses and is best-effort:
# any error raised while checking or parsing yields `nil`.
#
# @return [Nokogiri::HTML, nil]
#   Parsed document, or `nil` if the response data wasn't text-based or
#   couldn't be parsed.
def doc
    @doc ||= begin
        text? ? Nokogiri::HTML( @html ) : nil
    rescue StandardError
        nil
    end
end

#forms(html = nil) ⇒ Array<Element::Form>

Returns forms from `html`.

Parameters:

  • html (String, Nokogiri::HTML) (defaults to: nil)

    Document to analyze, defaults to #doc.

Returns:



254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# File 'lib/arachni/parser.rb', line 254

# Extracts the forms of the page and, when secondary responses are
# available, flags nonce inputs.
#
# A hidden input of a form is recorded as that form's `nonce_name` when the
# same form (matched on `id` and `name_or_id`) in a secondary response
# carries a different value for it.
#
# @param [String, Nokogiri::HTML] html
#   Document to analyze, defaults to #doc.
#
# @return [Array<Element::Form>] Forms from `html`.
def forms( html = nil )
    return [] unless text? || html

    primary_forms = Form.from_document( @url, html || doc )
    return primary_forms unless @secondary_responses

    @secondary_responses.each do |response|
        next if response.body.to_s.empty?

        Form.from_document( @url, response.body ).each do |other|
            primary_forms.each do |form|
                # Only compare forms which share the same identity.
                next unless "#{form.id}:#{form.name_or_id}" ==
                    "#{other.id}:#{other.name_or_id}"

                form.auditable.each do |name, value|
                    next if value == other.auditable[name]
                    next unless form.field_type_for( name ) == 'hidden'

                    form.nonce_name = name
                end
            end
        end
    end

    primary_forms
end

#headersHash

Note:

It's more of a placeholder method; it doesn't actually analyze anything. It's a long shot that any of these will be vulnerable, but better safe than sorry.

Returns List of valid auditable HTTP header fields.

Returns:

  • (Array<Header>)

    List of valid auditable HTTP header fields, one Header element per field.



237
238
239
240
241
242
243
244
245
246
247
248
# File 'lib/arachni/parser.rb', line 237

# Builds the auditable HTTP request header elements for the page.
#
# Nothing is analyzed here: a fixed set of header fields is filled in with
# static defaults, values from the options and the page URL.
#
# @return [Array<Header>]
#   One `Header` per field, each created against the page URL.
def headers
    fields = {
        'Accept'          => 'text/html,application/xhtml+xml,application' +
            '/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset'  => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
        'From'            => @opts.authed_by  || '',
        'User-Agent'      => @opts.user_agent || '',
        'Referer'         => @url,
        'Pragma'          => 'no-cache'
    }

    fields.map do |name, value|
        Header.new( @url, { name => value } )
    end
end

Returns parameters found in `url`.

Parameters:

  • url (String)

    URL to analyze.

Returns:

  • (Hash)

    Parameters found in `url`.



297
298
299
# File 'lib/arachni/parser.rb', line 297

# Extracts the query parameters of `url` by delegating to
# `Link.parse_query_vars`.
#
# @param [String] url URL to analyze.
#
# @return [Hash] Parameters found in `url`.
def link_vars( url )
    Link.parse_query_vars( url )
end

Returns links in `html`.

Parameters:

  • html (String, Nokogiri::HTML) (defaults to: nil)

    Document to analyze, defaults to #doc.

Returns:



285
286
287
288
289
290
291
292
293
# File 'lib/arachni/parser.rb', line 285

# Collects the links of the page.
#
# A link for the page URL itself is included first when that URL carries
# query parameters or when the response is a redirection; links found in
# the document follow, without duplicates.
#
# @param [String, Nokogiri::HTML] html
#   Document to analyze, defaults to #doc.
#
# @return [Array] Links in `html`.
def links( html = nil )
    return [] unless text? || html

    query_vars = link_vars( @url )
    own_links  =
        if query_vars.empty? && !@response.redirection?
            []
        else
            [Link.new( @url, query_vars )]
        end

    own_links | Link.from_document( @url, html || doc )
end

#pagePage Also known as: run

Returns Parsed page object based on the given options and HTTP responses.

Returns:

  • (Page)

    Parsed page object based on the given options and HTTP responses.



122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# File 'lib/arachni/parser.rb', line 122

# Assembles the page object for the parsed response.
#
# Non-text responses produce a minimal page that only carries the
# self-referencing link. Text responses additionally carry the parsed
# document, paths, forms, links, headers and cookies.
#
# Cookies are gathered from the response itself, from the cookie-jar file
# named in the options (when it exists), from the cookies set in the
# options and from the HTTP cookie-jar; cookies from the latter three are
# only added when no response cookie has the same name.
#
# @return [Page]
#   Parsed page object based on the given options and HTTP responses.
def page
    req_method = @response.request ? @response.request.method.to_s : 'get'

    self_link = Link.new( @url, inputs: link_vars( @url ) )

    # Non text files won't contain any auditable elements.
    if !text?
        page = Page.new(
            code:             @code,
            url:              @url,
            method:           req_method,
            query_vars:       self_link.auditable,
            body:             @html,
            request_headers:  @response.request ? @response.request.headers : {},
            response_headers: @response_headers,
            text:             false,
            links:            [self_link]
        )
        Platform::Manager.fingerprint( page ) if Options.fingerprint?
        return page
    end

    # Extract cookies from the response.
    c_cookies = cookies

    # Make a list of the response cookie names.
    cookie_names = c_cookies.map { |c| c.name }

    from_jar = []

    # If there's a Netscape cookiejar file load cookies from it but only
    # new ones, i.e. only if they weren't already in the response.
    #
    # `File.exist?` is used because `File.exists?` is deprecated and no
    # longer available as of Ruby 3.2.
    if @opts.cookie_jar.is_a?( String ) && File.exist?( @opts.cookie_jar )
        from_jar |= cookies_from_file( @url, @opts.cookie_jar )
            .reject { |c| cookie_names.include?( c.name ) }
    end

    # If we somehow have runtime configuration cookies load them too, but
    # only if they haven't already been seen.
    if @opts.cookies && !@opts.cookies.empty?
        from_jar |= @opts.cookies.reject { |c| cookie_names.include?( c.name ) }
    end

    # Grab cookies from the HTTP cookiejar and filter out the ones whose
    # names were already seen in the response.
    from_http_jar = HTTP.instance.cookie_jar.cookies.reject do |c|
        cookie_names.include?( c.name )
    end

    # These cookies are to be audited and thus are dirty and anarchistic,
    # so they have to contain even cookies completely irrelevant to the
    # current page. I.e. it contains all cookies that have been observed
    # since the beginning of the scan.
    #
    # Each cookie is duplicated before its action is pointed at this page,
    # so the originals stay untouched.
    cookies_to_be_audited = (c_cookies | from_jar | from_http_jar).map do |c|
        dc = c.dup
        dc.action = @url
        dc
    end

    page = Page.new(
        code:             @code,
        url:              @url,
        query_vars:       self_link.auditable,
        method:           req_method,
        body:             @html,

        request_headers:  @response.request ? @response.request.headers : {},
        response_headers: @response_headers,

        document:         doc,

        # All paths seen in the page.
        paths:            paths,
        forms:            forms,

        # All `href` attributes from `a` elements.
        links:            links | [self_link],

        cookies:          cookies_to_be_audited,
        headers:          headers,

        # This is the page cookiejar, each time the page is to be audited
        # by a module, the cookiejar of the HTTP class will be updated
        # with the cookies specified here.
        cookiejar:        c_cookies | from_jar,

        # Contains text-based data -- i.e. not a binary response.
        text:             true
    )
    Platform::Manager.fingerprint( page ) if Options.fingerprint?
    page
end

#pathsArray<String>

Returns Distinct links to follow.

Returns:



310
311
312
313
314
315
316
# File 'lib/arachni/parser.rb', line 310

# Lazily computes and memoizes the paths found in the page.
#
# Without a parsed document the result is an empty Array; otherwise it is
# whatever `run_extractors` returns.
#
# @return [Array<String>] Distinct links to follow.
def paths
  if @paths.nil?
    # Placeholder value, kept in place while extraction runs.
    @paths = []
    @paths = run_extractors if doc
  end

  @paths
end

#text?Boolean

Returns `true` if the given HTTP response data are text based, `false` otherwise.

Returns:

  • (Boolean)

    `true` if the given HTTP response data are text based, `false` otherwise.



217
218
219
# File 'lib/arachni/parser.rb', line 217

# Delegates to the `text?` check of the primary HTTP response.
#
# @return [Boolean]
#   `true` if the given HTTP response data are text based, `false` otherwise.
def text?
    @response.text?
end

#to_absolute(relative_url) ⇒ String

Converts a relative URL to an absolute one.

Parameters:

  • relative_url (String)

    URL to convert to absolute.

Returns:



111
112
113
114
115
116
117
118
# File 'lib/arachni/parser.rb', line 111

# Converts a relative URL to an absolute one.
#
# The URL is resolved against the document's `base href` when there is
# one and against the page URL otherwise; the conversion itself is done by
# the inherited implementation.
#
# @param [String] relative_url URL to convert to absolute.
#
# @return [String] Absolute URL.
def to_absolute( relative_url )
    super( relative_url, base || @url )
end