Class: Spidr::Agent

Inherits:
Object
  • Object
show all
Includes:
Actions, Events, Filters, Sanitizers
Defined in:
lib/spidr/agent.rb

Instance Attribute Summary collapse

Attributes included from Filters

#schemes

Attributes included from Sanitizers

#strip_fragments, #strip_query

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Actions

#continue!, #initialize_actions, #pause!, #pause=, #paused?, #skip_link!, #skip_page!

Methods included from Events

#all_headers, #every_atom_doc, #every_atom_page, #every_bad_request_page, #every_css_page, #every_doc, #every_failed_url, #every_forbidden_page, #every_html_doc, #every_html_page, #every_internal_server_error_page, #every_javascript_page, #every_link, #every_missing_page, #every_ms_word_page, #every_ok_page, #every_page, #every_pdf_page, #every_redirect_page, #every_rss_doc, #every_rss_page, #every_timedout_page, #every_txt_page, #every_unauthorized_page, #every_url, #every_url_like, #every_xml_doc, #every_xml_page, #every_xsl_doc, #every_xsl_page, #every_zip_page, #initialize_events, #urls_like

Methods included from Filters

#ignore_exts, #ignore_exts_like, #ignore_hosts, #ignore_hosts_like, #ignore_links, #ignore_links_like, #ignore_ports, #ignore_ports_like, #ignore_urls, #ignore_urls_like, #initialize_filters, #visit_ext?, #visit_exts, #visit_exts_like, #visit_host?, #visit_hosts, #visit_hosts_like, #visit_link?, #visit_links, #visit_links_like, #visit_port?, #visit_ports, #visit_ports_like, #visit_scheme?, #visit_url?, #visit_urls, #visit_urls_like

Methods included from Sanitizers

#initialize_sanitizers, #sanitize_url

Constructor Details

#initialize(options = {}) {|agent| ... } ⇒ Agent

Creates a new Agent object.

Parameters:

  • options (Hash) (defaults to: {})

    Additional options

  • :proxy (Hash)

    a customizable set of options

Options Hash (options):

  • :proxy (Hash) — default: Spidr.proxy

    The proxy information to use.

  • :host_header (String)

    The HTTP Host header to use with each request.

  • :host_headers (Hash{String,Regexp => String})

    The HTTP Host headers to use for specific hosts.

  • :user_agent (String) — default: Spidr.user_agent

    The User-Agent string to send with each request.

  • :referer (String)

    The Referer URL to send with each request.

  • :delay (Integer) — default: 0

    The number of seconds to pause between each request.

  • :queue (Set, Array)

    The initial queue of URLs to visit.

  • :history (Set, Array)

    The initial list of visited URLs.

  • :max_depth (Integer)

    The maximum link depth to follow.

Yields:

  • (agent)

    If a block is given, it will be passed the newly created agent for further configuration.

Yield Parameters:

  • agent (Agent)

    The newly created agent.



110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/spidr/agent.rb', line 110

#
# Creates a new Agent object.
#
# @param [Hash] options
#   Additional options. The same Hash is also handed to the
#   sanitizer, filter, action and event initializers.
#
# @option options [Hash] :proxy (Spidr.proxy)
#   The proxy information to use.
#
# @option options [String] :host_header
#   The HTTP Host header to use with each request.
#
# @option options [Hash{String,Regexp => String}] :host_headers
#   The HTTP Host headers to use for specific hosts.
#
# @option options [String] :user_agent (Spidr.user_agent)
#   The User-Agent string to send with each request.
#
# @option options [String] :referer
#   The Referer URL to send with each request.
#
# @option options [Integer] :delay (0)
#   The number of seconds to pause between each request.
#
# @option options [Set, Array] :queue
#   The initial queue of URLs to visit.
#
# @option options [Set, Array] :history
#   The initial list of visited URLs.
#
# @option options [Integer] :max_depth
#   The maximum link depth to follow.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   for further configuration.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
def initialize(options={})
  @host_header = options[:host_header]
  @host_headers = {}

  if options[:host_headers]
    @host_headers.merge!(options[:host_headers])
  end

  @user_agent = options.fetch(:user_agent,Spidr.user_agent)
  @referer = options[:referer]

  @sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
  @cookies = CookieJar.new
  @authorized = AuthStore.new

  @running = false
  @delay = options.fetch(:delay,0)
  @failures = Set[]

  # The :history and :queue options are documented but were previously
  # ignored; seed both collections from them, coercing anything that is
  # not already a URI (the same coercion #visited? and #failed? apply).
  to_uri = lambda { |url| url.kind_of?(URI) ? url : URI(url.to_s) }

  @history = Set[]
  (options[:history] || []).each { |url| @history << to_uri.call(url) }

  @queue = []
  (options[:queue] || []).each { |url| @queue << to_uri.call(url) }

  # unknown URLs report a depth of 0
  @levels = Hash.new(0)
  @max_depth = options[:max_depth]

  initialize_sanitizers(options)
  initialize_filters(options)
  initialize_actions(options)
  initialize_events(options)

  yield self if block_given?
end

Instance Attribute Details

#authorizedObject

HTTP Authentication credentials



32
33
34
# File 'lib/spidr/agent.rb', line 32

# Returns the HTTP authentication credentials held in @authorized.
def authorized
  @authorized
end

#cookiesObject (readonly)

Cached cookies



50
51
52
# File 'lib/spidr/agent.rb', line 50

# Returns the cached cookies held in @cookies (read-only attribute).
def cookies
  @cookies
end

#delayObject

Delay in between fetching pages



38
39
40
# File 'lib/spidr/agent.rb', line 38

# Returns the delay, in seconds, applied between page fetches.
def delay
  @delay
end

#failuresObject

List of unreachable URLs



44
45
46
# File 'lib/spidr/agent.rb', line 44

# Returns the collection of URLs that could not be reached.
def failures
  @failures
end

#historyObject Also known as: visited_urls

History containing visited URLs



41
42
43
# File 'lib/spidr/agent.rb', line 41

# Returns the history of visited URLs (also available as visited_urls).
def history
  @history
end

#host_headerObject

HTTP Host Header to use



23
24
25
# File 'lib/spidr/agent.rb', line 23

# Returns the HTTP Host header used as the fallback for requests.
def host_header
  @host_header
end

#host_headersObject (readonly)

HTTP Host Headers to use for specific hosts



26
27
28
# File 'lib/spidr/agent.rb', line 26

# Returns the per-host HTTP Host header overrides (read-only attribute).
def host_headers
  @host_headers
end

#levelsObject (readonly)

The visited URLs and their depth within a site



56
57
58
# File 'lib/spidr/agent.rb', line 56

# Returns the mapping of URLs to their link depth within a site
# (read-only attribute).
def levels
  @levels
end

#max_depthObject (readonly)

Maximum depth



53
54
55
# File 'lib/spidr/agent.rb', line 53

# Returns the maximum link depth to follow, or nil when no limit was set
# (read-only attribute).
def max_depth
  @max_depth
end

#queueObject Also known as: pending_urls

Queue of URLs to visit



47
48
49
# File 'lib/spidr/agent.rb', line 47

# Returns the queue of URLs waiting to be visited (also available as
# pending_urls).
def queue
  @queue
end

#refererObject

Referer to use



35
36
37
# File 'lib/spidr/agent.rb', line 35

# Returns the Referer value sent with requests, if one is set.
def referer
  @referer
end

#user_agentObject

User-Agent to use



29
30
31
# File 'lib/spidr/agent.rb', line 29

# Returns the User-Agent string sent with requests, if one is set.
def user_agent
  @user_agent
end

Class Method Details

.host(name, options = {}) {|agent| ... } ⇒ Object

Creates a new agent and spiders the given host.

Parameters:

  • name (String)

    The host-name to spider.

  • options (Hash) (defaults to: {})

    Additional options. See #initialize.

Yields:

  • (agent)

    If a block is given, it will be passed the newly created agent before it begins spidering.

Yield Parameters:

  • agent (Agent)

    The newly created agent.



202
203
204
# File 'lib/spidr/agent.rb', line 202

#
# Creates a new agent and spiders the given host.
#
# @param [String] name
#   The host-name to spider.
#
# @param [Hash] options
#   Additional options, passed through to site.
#
# @yield [agent]
#   The optional block is forwarded to site.
#
def self.host(name,options={},&block)
  # the root page of the host, reached over plain HTTP
  root_url = URI::HTTP.build(:host => name, :path => '/')

  site(root_url,options,&block)
end

.site(url, options = {}) {|agent| ... } ⇒ Object

Creates a new agent and spiders the web-site located at the given URL.

Parameters:

  • url (URI::HTTP, String)

    The web-site to spider.

  • options (Hash) (defaults to: {})

    Additional options. See #initialize.

Yields:

  • (agent)

    If a block is given, it will be passed the newly created agent before it begins spidering.

Yield Parameters:

  • agent (Agent)

    The newly created agent.



179
180
181
182
183
184
# File 'lib/spidr/agent.rb', line 179

#
# Creates a new agent and spiders the web-site located at the given URL.
#
# @param [URI::HTTP, String] url
#   The web-site to spider. Anything that is not a URI is converted
#   via its String form.
#
# @param [Hash] options
#   Additional options for the new agent. The host of the URL is merged
#   in under the :host key; the caller's Hash is not modified.
#
# @yield [agent]
#   The optional block is forwarded to new.
#
# @return
#   Whatever the new agent's start_at returns.
#
def self.site(url,options={},&block)
  site_url = if url.kind_of?(URI)
               url
             else
               URI(url.to_s)
             end

  agent_options = options.merge(:host => site_url.host)

  new(agent_options,&block).start_at(site_url)
end

.start_at(url, options = {}) {|agent| ... } ⇒ Object

Creates a new agent and begins spidering at the given URL.

Parameters:

  • url (URI::HTTP, String)

    The URL to start spidering at.

  • options (Hash) (defaults to: {})

    Additional options. See #initialize.

Yields:

  • (agent)

    If a block is given, it will be passed the newly created agent before it begins spidering.

Yield Parameters:

  • agent (Agent)

    The newly created agent.



158
159
160
161
# File 'lib/spidr/agent.rb', line 158

#
# Creates a new agent and begins spidering at the given URL.
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at.
#
# @param [Hash] options
#   Additional options, passed through to new.
#
# @yield [agent]
#   The optional block is forwarded to new.
#
# @return
#   Whatever the new agent's start_at returns.
#
def self.start_at(url,options={},&block)
  new(options,&block).start_at(url)
end

Instance Method Details

#clearObject

Clears the queue, history and failures of the agent.



209
210
211
212
213
214
# File 'lib/spidr/agent.rb', line 209

#
# Empties the queue, the history and the failures of the agent.
#
# @return [Agent]
#   The agent itself.
#
def clear
  [@queue, @history, @failures].each { |collection| collection.clear }

  self
end

#dequeueURI::HTTP (protected)

Dequeues a URL that will later be visited.

Returns:

  • (URI::HTTP)

    The URL that was at the front of the queue.



703
704
705
# File 'lib/spidr/agent.rb', line 703

# Removes and returns the element at the front of @queue by calling
# shift on it.
def dequeue
  @queue.shift
end

#enqueue(url, level = 0) ⇒ Boolean

Enqueues a given URL for visiting, only if it passes all of the agent's rules for visiting a given URL.

Parameters:

  • url (URI::HTTP, String)

    The URL to enqueue for visiting.

Returns:

  • (Boolean)

    Specifies whether the URL was enqueued, or ignored.



456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
# File 'lib/spidr/agent.rb', line 456

#
# Enqueues a given URL for visiting, only if it is not already queued
# and passes all of the agent's rules for visiting a URL.
#
# @param [URI::HTTP, String] url
#   The URL to enqueue for visiting. It is run through sanitize_url
#   first.
#
# @param [Integer] level
#   The depth recorded for the URL in @levels.
#
# @return [Boolean]
#   true if the URL was enqueued, false if it was ignored or a callback
#   raised Actions::SkipLink.
#
# @raise [Actions::Paused]
#   Re-raised when a callback pauses the agent.
#
def enqueue(url,level=0)
  url = sanitize_url(url)

  # already waiting in the queue, or rejected by the visit rules
  return false if queued?(url) || !visit?(url)

  link = url.to_s

  begin
    @every_url_blocks.each { |callback| callback.call(url) }

    @every_url_like_blocks.each do |pattern,callbacks|
      # Regexp patterns are matched against the String form of the URL,
      # anything else is compared for equality with the String or URI
      matched = if pattern.kind_of?(Regexp)
                  link =~ pattern
                else
                  (pattern == link) || (pattern == url)
                end

      next unless matched

      callbacks.each { |callback| callback.call(url) }
    end
  rescue Actions::Paused
    # a pause has to reach the caller
    raise
  rescue Actions::SkipLink
    return false
  rescue Actions::Action
    # every other action is ignored and the URL is still enqueued
  end

  @queue << url
  @levels[url] = level
  true
end

#failed(url) ⇒ Object (protected)

Adds a given URL to the failures list.

Parameters:

  • url (URI::HTTP)

    The URL to add to the failures list.



732
733
734
735
736
# File 'lib/spidr/agent.rb', line 732

#
# Adds a given URL to the failures list and notifies every
# failed-URL callback.
#
# @param [URI::HTTP] url
#   The URL to add to the failures list.
#
# @return [true]
#
def failed(url)
  @failures << url

  @every_failed_url_blocks.each do |callback|
    callback.call(url)
  end

  true
end

#failed?(url) ⇒ Boolean

Determines whether a given URL could not be visited.

Parameters:

  • url (URI::HTTP, String)

    The URL to check for failures.

Returns:

  • (Boolean)

    Specifies whether the given URL was unable to be visited.



399
400
401
402
403
# File 'lib/spidr/agent.rb', line 399

#
# Determines whether a given URL could not be visited.
#
# @param [URI::HTTP, String] url
#   The URL to check for failures. Anything that is not a URI is
#   converted via its String form before the lookup.
#
# @return [Boolean]
#   Specifies whether the given URL is in the failures list.
#
def failed?(url)
  target = url.kind_of?(URI) ? url : URI(url.to_s)

  @failures.include?(target)
end

#get_page(url) {|page| ... } ⇒ Page?

Requests and creates a new Page object from a given URL.

Parameters:

  • url (URI::HTTP)

    The URL to request.

Yields:

  • (page)

    If a block is given, it will be passed the page that represents the response.

Yield Parameters:

  • page (Page)

    The page for the response.

Returns:

  • (Page, nil)

    The page for the response, or nil if the request failed.



508
509
510
511
512
513
514
515
516
517
518
519
520
# File 'lib/spidr/agent.rb', line 508

#
# Requests and creates a new Page object from a given URL.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [page]
#   If a block is given, it will be passed the page that represents
#   the response.
#
# @yieldparam [Page] page
#   The page for the response.
#
# @return [Page, nil]
#   The page for the response, or whatever prepare_request returns when
#   it does not complete the request (nil on the failures it rescues).
#
def get_page(url)
  url = URI(url.to_s)

  prepare_request(url) do |session,path,headers|
    response = session.get(path,headers)
    page = Page.new(url,response)

    # remember any cookies the response carried
    @cookies.from_page(page)

    yield(page) if block_given?
    return page
  end
end

#post_page(url, post_data = '') {|page| ... } ⇒ Page?

Posts supplied form data and creates a new Page object from a given URL.

Parameters:

  • url (URI::HTTP)

    The URL to request.

  • post_data (String) (defaults to: '')

    Form option data.

Yields:

  • (page)

    If a block is given, it will be passed the page that represents the response.

Yield Parameters:

  • page (Page)

    The page for the response.

Returns:

  • (Page, nil)

    The page for the response, or nil if the request failed.

Since:

  • 0.2.2



543
544
545
546
547
548
549
550
551
552
553
554
555
# File 'lib/spidr/agent.rb', line 543

#
# Posts supplied form data and creates a new Page object from a given
# URL.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @param [String] post_data
#   Form option data.
#
# @yield [page]
#   If a block is given, it will be passed the page that represents
#   the response.
#
# @yieldparam [Page] page
#   The page for the response.
#
# @return [Page, nil]
#   The page for the response, or whatever prepare_request returns when
#   it does not complete the request (nil on the failures it rescues).
#
# @since 0.2.2
#
def post_page(url,post_data='')
  url = URI(url.to_s)

  prepare_request(url) do |session,path,headers|
    response = session.post(path,post_data,headers)
    page = Page.new(url,response)

    # remember any cookies the response carried
    @cookies.from_page(page)

    yield(page) if block_given?
    return page
  end
end

#prepare_request(url) {|request| ... } ⇒ Object (protected)

Normalizes the request path and grabs a session to handle page get and post requests.

Parameters:

  • url (URI::HTTP)

    The URL to request.

Yields:

  • (request)

    A block whose purpose is to make a page request.

Yield Parameters:

  • session (Net::HTTP)

    An HTTP session object.

  • path (String)

    Normalized URL string.

  • headers (Hash)

    A Hash of request header options.

Since:

  • 0.2.2



644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
# File 'lib/spidr/agent.rb', line 644

#
# Normalizes the request path, builds the request headers and grabs a
# session to handle page get and post requests.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [session, path, headers]
#   A block whose purpose is to make a page request.
#
# @yieldparam session
#   The session looked up from @sessions for the URL.
#
# @yieldparam [String] path
#   The URL path ('/' when empty) with the query string appended.
#
# @yieldparam [Hash] headers
#   A Hash of request header options.
#
# @return
#   The value of the block, or nil when one of the rescued network
#   errors occurred (the session is then killed and the URL recorded
#   via failed).
#
# @since 0.2.2
#
def prepare_request(url,&block)
  host = url.host

  # an empty path means the root of the site
  path = url.path.empty? ? '/' : url.path

  # append the URL query to the path
  path += "?#{url.query}" if url.query

  # set any additional HTTP headers
  headers = {}

  # the first per-host entry whose name matches the host wins
  host_override = @host_headers.find { |name,_header| host.match(name) }
  headers['Host'] = host_override.last if host_override

  headers['Host'] ||= @host_header if @host_header
  headers['User-Agent'] = @user_agent if @user_agent
  headers['Referer'] = @referer if @referer

  authorization = @authorized.for_url(url)
  headers['Authorization'] = "Basic #{authorization}" if authorization

  header_cookies = @cookies.for_host(url.host)
  headers['Cookie'] = header_cookies if header_cookies

  begin
    sleep(@delay) if @delay > 0

    yield @sessions[url], path, headers
  rescue SystemCallError,
         Timeout::Error,
         SocketError,
         Net::HTTPBadResponse,
         IOError

    # drop the broken session and remember the URL as unreachable
    @sessions.kill!(url)

    failed(url)
    return nil
  end
end

#proxyHash

The proxy information the agent uses.

Returns:

  • (Hash)

    The proxy information.

See Also:

Since:

  • 0.2.2



280
281
282
# File 'lib/spidr/agent.rb', line 280

# Returns the proxy information by delegating to @sessions.proxy.
def proxy
  @sessions.proxy
end

#proxy=(new_proxy) ⇒ Hash

Sets the proxy information that the agent uses.

Parameters:

  • new_proxy (Hash)

    The new proxy information.

Returns:

  • (Hash)

    The new proxy information.

See Also:

Since:

  • 0.2.2



297
298
299
# File 'lib/spidr/agent.rb', line 297

# Sets the proxy information by assigning new_proxy to @sessions.proxy.
def proxy=(new_proxy)
  @sessions.proxy = new_proxy
end

#queued?(url) ⇒ Boolean

Determines whether a given URL has been enqueued.

Parameters:

  • url (URI::HTTP)

    The URL to search for in the queue.

Returns:

  • (Boolean)

    Specifies whether the given URL has been queued for visiting.



442
443
444
# File 'lib/spidr/agent.rb', line 442

# Determines whether the given URL is currently in @queue. The URL is
# looked up as given; no conversion is applied here.
def queued?(url)
  @queue.include?(url)
end

#run {|page| ... } ⇒ Object

Start spidering until the queue becomes empty or the agent is paused.

Yields:

  • (page)

    If a block is given, it will be passed every page visited.

Yield Parameters:

  • page (Page)

    A page which has been visited.



243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# File 'lib/spidr/agent.rb', line 243

#
# Start spidering until the queue becomes empty or the agent is paused.
#
# @yield [page]
#   If a block is given, it is forwarded to visit_page for every
#   dequeued URL.
#
# @yieldparam [Page] page
#   A page which has been visited.
#
# @return [Agent]
#   The agent itself. When Actions::Paused is raised while visiting a
#   page the method returns immediately, leaving @running set and the
#   sessions untouched.
#
def run(&block)
  @running = true

  while !@queue.empty? && !paused?
    begin
      visit_page(dequeue,&block)
    rescue Actions::Paused
      return self
    rescue Actions::Action
      # any other action only abandons the current page
    end
  end

  @running = false
  @sessions.clear
  self
end

#running?Boolean

Determines if the agent is running.

Returns:

  • (Boolean)

    Specifies whether the agent is running or stopped.



266
267
268
# File 'lib/spidr/agent.rb', line 266

# Determines if the agent is running: true only while @running
# compares equal to true.
def running?
  @running == true
end

#start_at(url) {|page| ... } ⇒ Object

Start spidering at a given URL.

Parameters:

  • url (URI::HTTP, String)

    The URL to start spidering at.

Yields:

  • (page)

    If a block is given, it will be passed every page visited.

Yield Parameters:

  • page (Page)

    A page which has been visited.



228
229
230
231
# File 'lib/spidr/agent.rb', line 228

#
# Start spidering at a given URL.
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at. It is enqueued before running.
#
# @yield [page]
#   If a block is given, it is forwarded to run.
#
# @yieldparam [Page] page
#   A page which has been visited.
#
# @return
#   The result of run.
#
def start_at(url,&block)
  enqueue(url)

  run(&block)
end

#to_hashHash

Converts the agent into a Hash.

Returns:

  • (Hash)

    The agent represented as a Hash containing the history and the queue of the agent.



617
618
619
# File 'lib/spidr/agent.rb', line 617

# Converts the agent into a Hash holding its history under :history
# and its queue under :queue. The values are the agent's own
# collections, not copies.
def to_hash
  {:history => @history, :queue => @queue}
end

#visit?(url) ⇒ Boolean (protected)

Determines if a given URL should be visited.

Parameters:

  • url (URI::HTTP)

    The URL in question.

Returns:

  • (Boolean)

    Specifies whether the given URL should be visited.



716
717
718
719
720
721
722
723
724
# File 'lib/spidr/agent.rb', line 716

#
# Determines if a given URL should be visited.
#
# @param [URI::HTTP] url
#   The URL in question.
#
# @return [Boolean]
#   false for an already visited URL; otherwise the result of chaining
#   the scheme, host, port, link, URL and extension checks, stopping at
#   the first one that fails.
#
def visit?(url)
  if visited?(url)
    false
  else
    visit_scheme?(url.scheme) &&
      visit_host?(url.host) &&
      visit_port?(url.port) &&
      visit_link?(url.to_s) &&
      visit_url?(url) &&
      visit_ext?(url.path)
  end
end

#visit_page(url) {|page| ... } ⇒ Page?

Visits a given URL, and enqueues the links recovered from the URL to be visited later.

Parameters:

  • url (URI::HTTP, String)

    The URL to visit.

Yields:

  • (page)

    If a block is given, it will be passed the page which was visited.

Yield Parameters:

  • page (Page)

    The page which was visited.

Returns:

  • (Page, nil)

    The page that was visited. If nil is returned, either the request for the page failed, or the page was skipped.



574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
# File 'lib/spidr/agent.rb', line 574

#
# Visits a given URL, and enqueues the links recovered from the page
# to be visited later.
#
# @param [URI::HTTP, String] url
#   The URL to visit. It is run through sanitize_url first.
#
# @yield [page]
#   If a block is given, it will be passed the page which was visited.
#
# @yieldparam [Page] page
#   The page which was visited.
#
# @return [Page, nil]
#   The result of get_page, or nil when a callback raised
#   Actions::SkipPage.
#
# @raise [Actions::Paused]
#   Re-raised when a page or link callback pauses the agent.
#
def visit_page(url)
  url = sanitize_url(url)

  get_page(url) do |page|
    # the page counts as visited even if a callback skips it below
    @history << page.url

    begin
      @every_page_blocks.each do |callback|
        callback.call(page)
      end

      yield(page) if block_given?
    rescue Actions::Paused
      raise
    rescue Actions::SkipPage
      return nil
    rescue Actions::Action
      # other actions do not stop the links from being processed
    end

    page.each_url do |link_url|
      begin
        @every_link_blocks.each { |callback| callback.call(page.url,link_url) }
      rescue Actions::Paused
        raise
      rescue Actions::SkipLink
        next
      rescue Actions::Action
        # other actions do not stop the link from being enqueued
      end

      # links are enqueued one level deeper than the page they were found on
      if @max_depth.nil? || @max_depth > @levels[url]
        enqueue(link_url,@levels[url] + 1)
      end
    end
  end
end

#visited?(url) ⇒ Boolean

Determines whether a URL was visited or not.

Parameters:

  • url (URI::HTTP, String)

    The URL to search for.

Returns:

  • (Boolean)

    Specifies whether a URL was visited.



358
359
360
361
362
# File 'lib/spidr/agent.rb', line 358

#
# Determines whether a URL was visited or not.
#
# @param [URI::HTTP, String] url
#   The URL to search for. Anything that is not a URI is converted
#   via its String form before the lookup.
#
# @return [Boolean]
#   Specifies whether the URL is in the history.
#
def visited?(url)
  target = url.kind_of?(URI) ? url : URI(url.to_s)

  @history.include?(target)
end

#visited_hostsArray<String>

Specifies all hosts that were visited.

Returns:

  • (Array<String>)

    The hosts which have been visited.



345
346
347
# File 'lib/spidr/agent.rb', line 345

#
# Specifies all hosts that were visited.
#
# @return [Array<String>]
#   The distinct hosts of the visited URLs.
#
def visited_hosts
  visited_urls.map(&:host).uniq
end

#visited_links ⇒ Array<String>

Specifies the links which have been visited.

Returns:

  • (Array<String>)

    The links which have been visited.



335
336
337
# File 'lib/spidr/agent.rb', line 335

#
# Specifies the links which have been visited.
#
# @return [Array<String>]
#   The String form of every URL in the history.
#
def visited_links
  @history.map(&:to_s)
end