Class: Spidr::Agent

Inherits:
Object
  • Object
show all
Includes:
Actions, Events, Filters, Sanitizers
Defined in:
lib/spidr/agent.rb

Instance Attribute Summary collapse

Attributes included from Filters

#schemes

Attributes included from Sanitizers

#strip_fragments, #strip_query

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Actions

#continue!, #initialize_actions, #pause!, #pause=, #paused?, #skip_link!, #skip_page!

Methods included from Events

#all_headers, #every_atom_doc, #every_atom_page, #every_bad_request_page, #every_css_page, #every_doc, #every_failed_url, #every_forbidden_page, #every_html_doc, #every_html_page, #every_internal_server_error_page, #every_javascript_page, #every_link, #every_missing_page, #every_ms_word_page, #every_ok_page, #every_page, #every_pdf_page, #every_redirect_page, #every_rss_doc, #every_rss_page, #every_timedout_page, #every_txt_page, #every_unauthorized_page, #every_url, #every_url_like, #every_xml_doc, #every_xml_page, #every_xsl_doc, #every_xsl_page, #every_zip_page, #initialize_events, #urls_like

Methods included from Filters

#ignore_exts, #ignore_exts_like, #ignore_hosts, #ignore_hosts_like, #ignore_links, #ignore_links_like, #ignore_ports, #ignore_ports_like, #ignore_urls, #ignore_urls_like, #initialize_filters, #visit_ext?, #visit_exts, #visit_exts_like, #visit_host?, #visit_hosts, #visit_hosts_like, #visit_link?, #visit_links, #visit_links_like, #visit_port?, #visit_ports, #visit_ports_like, #visit_scheme?, #visit_url?, #visit_urls, #visit_urls_like

Methods included from Sanitizers

#initialize_sanitizers, #sanitize_url

Constructor Details

#initialize(options = {}) {|agent| ... } ⇒ Agent

Creates a new Agent object.

Parameters:

  • options (Hash) (defaults to: {})

    Additional options

  • :proxy (Hash)

    a customizable set of options

Options Hash (options):

  • :proxy (Hash) — default: Spidr.proxy

    The proxy information to use.

  • :host_header (String)

    The HTTP Host header to use with each request.

  • :host_headers (Hash{String,Regexp => String})

    The HTTP Host headers to use for specific hosts.

  • :user_agent (String) — default: Spidr.user_agent

    The User-Agent string to send with each request.

  • :referer (String)

    The Referer URL to send with each request.

  • :delay (Integer) — default: 0

    The number of seconds to pause between each request.

  • :queue (Set, Array)

    The initial queue of URLs to visit.

  • :history (Set, Array)

    The initial list of visited URLs.

  • :max_depth (Integer)

    The maximum link depth to follow.

Yields:

  • (agent)

    If a block is given, it will be passed the newly created agent for further configuration.

Yield Parameters:

  • agent (Agent)

    The newly created agent.

See Also:



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# File 'lib/spidr/agent.rb', line 116

#
# Creates a new Agent object.
#
# @param [Hash] options
#   Additional options.
#
# @option options [Hash] :proxy (Spidr.proxy)
#   The proxy information to use.
#
# @option options [String] :host_header
#   The HTTP Host header to use with each request.
#
# @option options [Hash{String,Regexp => String}] :host_headers
#   The HTTP Host headers to use for specific hosts.
#
# @option options [String] :user_agent (Spidr.user_agent)
#   The User-Agent string to send with each request.
#
# @option options [String] :referer
#   The Referer URL to send with each request.
#
# @option options [Integer] :delay (0)
#   The number of seconds to pause between each request.
#
# @option options [Set, Array] :queue
#   The initial queue of URLs to visit.
#
# @option options [Set, Array] :history
#   The initial list of visited URLs.
#
# @option options [Integer] :max_depth
#   The maximum link depth to follow.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent for
#   further configuration.
#
# @yieldparam [Agent] agent
#   The newly created agent.
#
def initialize(options={})
  @host_header  = options[:host_header]
  @host_headers = {}

  if options[:host_headers]
    @host_headers.merge!(options[:host_headers])
  end

  @user_agent = options.fetch(:user_agent,Spidr.user_agent)
  @referer    = options[:referer]

  @sessions   = SessionCache.new(options.fetch(:proxy,Spidr.proxy))
  @cookies    = CookieJar.new
  @authorized = AuthStore.new

  @running  = false
  @delay    = options.fetch(:delay,0)
  @history  = Set[]
  @failures = Set[]
  @queue    = []

  # Honor the documented :history and :queue options, which were previously
  # accepted but ignored. Entries are coerced to URI objects, the same
  # coercion visited? and failed? apply to their arguments.
  seed = lambda do |target,urls|
    urls.each do |url|
      target << (url.kind_of?(URI) ? url : URI(url.to_s))
    end
  end

  seed.call(@history,options[:history]) if options[:history]
  seed.call(@queue,options[:queue])     if options[:queue]

  # URLs without a recorded depth are treated as depth 0
  @levels    = Hash.new(0)
  @max_depth = options[:max_depth]

  initialize_sanitizers(options)
  initialize_filters(options)
  initialize_actions(options)
  initialize_events(options)

  yield self if block_given?
end

Instance Attribute Details

#authorized ⇒ Object

HTTP Authentication credentials



33
34
35
# File 'lib/spidr/agent.rb', line 33

# Reader for the agent's HTTP authentication credentials.
#
# @return [Object]
#   The value currently stored in @authorized.
def authorized
  return @authorized
end

#cookies ⇒ Object (readonly)

Cached cookies



51
52
53
# File 'lib/spidr/agent.rb', line 51

# Reader for the agent's cached cookies.
#
# @return [Object]
#   The value currently stored in @cookies.
def cookies
  return @cookies
end

#delay ⇒ Object

Delay in between fetching pages



39
40
41
# File 'lib/spidr/agent.rb', line 39

# Reader for the delay applied in between fetching pages.
#
# @return [Object]
#   The value currently stored in @delay.
def delay
  return @delay
end

#failures ⇒ Object

List of unreachable URLs



45
46
47
# File 'lib/spidr/agent.rb', line 45

# Reader for the list of unreachable URLs.
#
# @return [Object]
#   The value currently stored in @failures.
def failures
  return @failures
end

#history ⇒ Object Also known as: visited_urls

History containing visited URLs



42
43
44
# File 'lib/spidr/agent.rb', line 42

# Reader for the history of visited URLs.
#
# @return [Object]
#   The value currently stored in @history.
def history
  return @history
end

#host_header ⇒ Object

HTTP Host Header to use



24
25
26
# File 'lib/spidr/agent.rb', line 24

# Reader for the HTTP Host header to use.
#
# @return [Object]
#   The value currently stored in @host_header.
def host_header
  return @host_header
end

#host_headers ⇒ Object (readonly)

HTTP Host Headers to use for specific hosts



27
28
29
# File 'lib/spidr/agent.rb', line 27

# Reader for the HTTP Host headers to use for specific hosts.
#
# @return [Object]
#   The value currently stored in @host_headers.
def host_headers
  return @host_headers
end

#levels ⇒ Object (readonly)

The visited URLs and their depth within a site



57
58
59
# File 'lib/spidr/agent.rb', line 57

# Reader for the visited URLs and their depth within a site.
#
# @return [Object]
#   The value currently stored in @levels.
def levels
  return @levels
end

#max_depth ⇒ Object (readonly)

Maximum depth



54
55
56
# File 'lib/spidr/agent.rb', line 54

# Reader for the maximum depth.
#
# @return [Object]
#   The value currently stored in @max_depth.
def max_depth
  return @max_depth
end

#queue ⇒ Object Also known as: pending_urls

Queue of URLs to visit



48
49
50
# File 'lib/spidr/agent.rb', line 48

# Reader for the queue of URLs to visit.
#
# @return [Object]
#   The value currently stored in @queue.
def queue
  return @queue
end

#referer ⇒ Object

Referer to use



36
37
38
# File 'lib/spidr/agent.rb', line 36

# Reader for the Referer to use.
#
# @return [Object]
#   The value currently stored in @referer.
def referer
  return @referer
end

#user_agent ⇒ Object

User-Agent to use



30
31
32
# File 'lib/spidr/agent.rb', line 30

# Reader for the User-Agent to use.
#
# @return [Object]
#   The value currently stored in @user_agent.
def user_agent
  return @user_agent
end

Class Method Details

._site(url, options = {}, regex, &block) ⇒ Object



215
216
217
218
219
220
# File 'lib/spidr/agent.rb', line 215

#
# Creates a new agent and spiders the web-site located at the given URL,
# handing the given pattern to #_run.
#
# @param [URI::HTTP, String] url
#   The web-site to spider. Anything that is not already a URI is converted
#   via its String form.
#
# @param [Hash] options
#   Additional options, passed to new with :host set to the host of the URL.
#
# @param [Regexp] regex
#   The pattern passed on to #_run.
#
# @yield [agent]
#   The block, if given, is forwarded to new.
#
# @return [Object]
#   Whatever #_run returns.
#
def self._site(url,options={},regex,&block)
  url = URI(url.to_s) unless url.kind_of?(URI)

  agent = new(options.merge(:host => url.host),&block)

  # The instance-level #start_at accepts a single URL, so calling it with
  # (regex,url) raised ArgumentError. Enqueue the start URL directly and
  # drive the regex-aware loop instead.
  agent.enqueue(url)
  agent._run(regex)
end

.host(name, options = {}) {|agent| ... } ⇒ Object

Creates a new agent and spiders the given host.

Parameters:

  • name (String)

    The host-name to spider.

  • options (Hash) (defaults to: {})

    Additional options. See #initialize.

Yields:

  • (agent)

    If a block is given, it will be passed the newly created agent before it begins spidering.

Yield Parameters:

  • agent (Agent)

    The newly created agent.



238
239
240
241
# File 'lib/spidr/agent.rb', line 238

#
# Creates a new agent restricted to the given host and starts spidering at
# the root path of that host over HTTP.
#
# @param [String] name
#   The host-name to spider.
#
# @param [Hash] options
#   Additional options, passed to new with :host set to the given name.
#
# @yield [agent]
#   The block, if given, is forwarded to new.
#
# @return [Object]
#   Whatever the agent's start_at returns.
#
def self.host(name,options={},&block)
  spider   = new(options.merge(:host => name),&block)
  root_url = URI::HTTP.build(:host => name, :path => '/')

  spider.start_at(root_url)
end

.site(url, options = {}) {|agent| ... } ⇒ Object

Creates a new agent and spiders the web-site located at the given URL.

Parameters:

  • url (URI::HTTP, String)

    The web-site to spider.

  • options (Hash) (defaults to: {})

    Additional options. See #initialize.

Yields:

  • (agent)

    If a block is given, it will be passed the newly created agent before it begins spidering.

Yield Parameters:

  • agent (Agent)

    The newly created agent.



207
208
209
210
211
212
# File 'lib/spidr/agent.rb', line 207

#
# Creates a new agent restricted to the host of the given URL and starts
# spidering at that URL.
#
# @param [URI::HTTP, String] url
#   The web-site to spider. Anything that is not already a URI is converted
#   via its String form.
#
# @param [Hash] options
#   Additional options, passed to new with :host set to the host of the URL.
#
# @yield [agent]
#   The block, if given, is forwarded to new.
#
# @return [Object]
#   Whatever the agent's start_at returns.
#
def self.site(url,options={},&block)
  target = url.kind_of?(URI) ? url : URI(url.to_s)

  new(options.merge(:host => target.host),&block).start_at(target)
end

.start_at(url, options = {}) {|agent| ... } ⇒ Object

Creates a new agent and begins spidering at the given URL.

Parameters:

  • url (URI::HTTP, String)

    The URL to start spidering at.

  • options (Hash) (defaults to: {})

    Additional options. See #initialize.

Yields:

  • (agent)

    If a block is given, it will be passed the newly created agent before it begins spidering.

Yield Parameters:

  • agent (Agent)

    The newly created agent.



164
165
166
167
# File 'lib/spidr/agent.rb', line 164

#
# Creates a new agent and begins spidering at the given URL.
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at.
#
# @param [Hash] options
#   Additional options, passed to new unchanged.
#
# @yield [agent]
#   The block, if given, is forwarded to new.
#
# @return [Object]
#   Whatever the agent's start_at returns.
#
def self.start_at(url,options={},&block)
  new(options,&block).start_at(url)
end

Instance Method Details

#_run(regex) {|page| ... } ⇒ Object

Start spidering until the queue becomes empty or the agent is paused.

Yields:

  • (page)

    If a block is given, it will be passed every page visited.

Yield Parameters:

  • page (Page)

    A page which has been visited.



287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# File 'lib/spidr/agent.rb', line 287

#
# Spiders until the queue becomes empty or the agent is paused, passing
# the given pattern on to #_visit_page for every dequeued URL.
#
# @param [Regexp] regex
#   The pattern handed to #_visit_page.
#
# @yield [page]
#   The block, if given, is forwarded to #_visit_page.
#
# @return [Agent]
#   The agent itself.
#
def _run(regex,&block)
  @running = true

  while !@queue.empty? && !paused?
    begin
      _visit_page(regex,dequeue,&block)
    rescue Actions::Paused
      # leave immediately; @running and the sessions are left untouched
      return self
    rescue Actions::Action
      # every other action only aborts the current page
    end
  end

  @running = false
  @sessions.clear
  self
end

#_visit_page(regex, url) {|page| ... } ⇒ Page?

Visits a given URL, and enqueues the links recovered from the URL to be visited later.

Parameters:

  • url (URI::HTTP, String)

    The URL to visit.

Yields:

  • (page)

    If a block is given, it will be passed the page which was visited.

Yield Parameters:

  • page (Page)

    The page which was visited.

Returns:

  • (Page, nil)

    The page that was visited. If nil is returned, either the request for the page failed, or the page was skipped.



321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# File 'lib/spidr/agent.rb', line 321

#
# Visits a given URL, and enqueues those links recovered from the page
# which match the given pattern.
#
# @param [Regexp] regex
#   Only links whose String form matches this pattern are enqueued.
#
# @param [URI::HTTP, String] url
#   The URL to visit.
#
# @yield [page]
#   If a block is given, it will be passed the page which was visited.
#
# @yieldparam [Page] page
#   The page which was visited.
#
# @return [Page, nil]
#   The value returned by get_page, or nil if the page was skipped.
#
def _visit_page(regex,url)
  url = sanitize_url(url)

  get_page(url) do |page|
    @history << page.url

    begin
      @every_page_blocks.each { |page_block| page_block.call(page) }

      yield page if block_given?
    rescue Actions::Paused => action
      # a pause must reach the run loop, so it is re-raised here before the
      # generic Actions::Action clause below can swallow it
      raise(action)
    rescue Actions::SkipPage
      return nil
    rescue Actions::Action
    end

    page.each_url do |next_url|
      begin
        @every_link_blocks.each do |link_block|
          link_block.call(page.url,next_url)
        end
      rescue Actions::Paused => action
        raise(action)
      rescue Actions::SkipLink
        next
      rescue Actions::Action
      end

      if (@max_depth.nil? || @max_depth > @levels[url])
        # NOTE(review): next_url may be a URI object rather than a String
        # (enqueue accepts both). Regexp#match raises TypeError for
        # arguments that are not string-like, so match on the String form.
        if regex.match(next_url.to_s)
          enqueue(next_url,@levels[url] + 1)
        end
      end
    end
  end
end

#clear ⇒ Object

Clears the history of the agent.



246
247
248
249
250
251
# File 'lib/spidr/agent.rb', line 246

#
# Empties the queue, the history and the failures list of the agent.
#
# @return [Agent]
#   The agent itself.
#
def clear
  [@queue, @history, @failures].each { |collection| collection.clear }
  self
end

#dequeue ⇒ URI::HTTP (protected)

Dequeues a URL that will later be visited.

Returns:

  • (URI::HTTP)

    The URL that was at the front of the queue.



830
831
832
# File 'lib/spidr/agent.rb', line 830

#
# Removes the URL at the front of the queue.
#
# @return [Object]
#   Whatever @queue.shift returns.
#
def dequeue
  front = @queue.shift
  return front
end

#enqueue(url, level = 0) ⇒ Boolean

Enqueues a given URL for visiting, only if it passes all of the agent's rules for visiting a given URL.

Parameters:

  • url (URI::HTTP, String)

    The URL to enqueue for visiting.

Returns:

  • (Boolean)

    Specifies whether the URL was enqueued, or ignored.



582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
# File 'lib/spidr/agent.rb', line 582

#
# Enqueues a given URL for visiting, provided it is not already queued and
# visit? accepts it.
#
# @param [URI::HTTP, String] url
#   The URL to enqueue for visiting.
#
# @param [Integer] level
#   The depth recorded for the URL in @levels.
#
# @return [Boolean]
#   true if the URL was enqueued, false if it was ignored or a callback
#   raised Actions::SkipLink.
#
def enqueue(url,level=0)
  url = sanitize_url(url)

  return false if queued?(url) || !visit?(url)

  link = url.to_s

  begin
    @every_url_blocks.each { |callback| callback.call(url) }

    @every_url_like_blocks.each do |pattern,callbacks|
      # Regexp patterns are matched against the String form of the URL;
      # anything else is compared to both the String and the URL itself.
      matched = if Regexp === pattern
                  link =~ pattern
                else
                  (pattern == link) || (pattern == url)
                end

      next unless matched

      callbacks.each { |callback| callback.call(url) }
    end
  rescue Actions::Paused => action
    raise(action)
  rescue Actions::SkipLink
    return false
  rescue Actions::Action
    # any other action does not prevent the URL from being enqueued
  end

  @queue << url
  @levels[url] = level
  true
end

#failed(url) ⇒ Object (protected)

Adds a given URL to the failures list.

Parameters:

  • url (URI::HTTP)

    The URL to add to the failures list.



859
860
861
862
863
# File 'lib/spidr/agent.rb', line 859

#
# Adds a given URL to the failures list and notifies every registered
# failed-URL callback.
#
# @param [URI::HTTP] url
#   The URL to add to the failures list.
#
# @return [true]
#
def failed(url)
  @failures << url

  @every_failed_url_blocks.each do |on_failure|
    on_failure.call(url)
  end

  true
end

#failed?(url) ⇒ Boolean

Determines whether a given URL could not be visited.

Parameters:

  • url (URI::HTTP, String)

    The URL to check for failures.

Returns:

  • (Boolean)

    Specifies whether the given URL was unable to be visited.



525
526
527
528
529
# File 'lib/spidr/agent.rb', line 525

#
# Determines whether a given URL is on the failures list.
#
# @param [URI::HTTP, String] url
#   The URL to check for failures. Anything that is not already a URI is
#   converted via its String form first.
#
# @return [Boolean]
#   Whether @failures includes the URL.
#
def failed?(url)
  candidate = url.kind_of?(URI) ? url : URI(url.to_s)

  @failures.include?(candidate)
end

#get_page(url) {|page| ... } ⇒ Page?

Requests and creates a new Page object from a given URL.

Parameters:

  • url (URI::HTTP)

    The URL to request.

Yields:

  • (page)

    If a block is given, it will be passed the page that represents the response.

Yield Parameters:

  • page (Page)

    The page for the response.

Returns:

  • (Page, nil)

    The page for the response, or nil if the request failed.



634
635
636
637
638
639
640
641
642
643
644
645
646
# File 'lib/spidr/agent.rb', line 634

#
# Requests a URL with GET and wraps the response in a new Page object.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [page]
#   If a block is given, it will be passed the page that represents the
#   response.
#
# @yieldparam [Page] page
#   The page for the response.
#
# @return [Page, nil]
#   The page for the response, or whatever prepare_request returns when it
#   does not complete the block.
#
def get_page(url)
  url = URI(url.to_s)

  prepare_request(url) do |session,path,headers|
    response = session.get(path,headers)
    fetched  = Page.new(url,response)

    # save any new cookies
    @cookies.from_page(fetched)

    yield fetched if block_given?

    # returns from get_page itself, not just from this block
    return fetched
  end
end

#post_page(url, post_data = '') {|page| ... } ⇒ Page?

Posts supplied form data and creates a new Page object from a given URL.

Parameters:

  • url (URI::HTTP)

    The URL to request.

  • post_data (String) (defaults to: '')

    Form option data.

Yields:

  • (page)

    If a block is given, it will be passed the page that represents the response.

Yield Parameters:

  • page (Page)

    The page for the response.

Returns:

  • (Page, nil)

    The page for the response, or nil if the request failed.

Since:

  • 0.2.2



669
670
671
672
673
674
675
676
677
678
679
680
681
# File 'lib/spidr/agent.rb', line 669

#
# Posts the supplied form data to a URL and wraps the response in a new
# Page object.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @param [String] post_data
#   Form option data.
#
# @yield [page]
#   If a block is given, it will be passed the page that represents the
#   response.
#
# @yieldparam [Page] page
#   The page for the response.
#
# @return [Page, nil]
#   The page for the response, or whatever prepare_request returns when it
#   does not complete the block.
#
def post_page(url,post_data='')
  url = URI(url.to_s)

  prepare_request(url) do |session,path,headers|
    response = session.post(path,post_data,headers)
    posted   = Page.new(url,response)

    # save any new cookies
    @cookies.from_page(posted)

    yield posted if block_given?

    # returns from post_page itself, not just from this block
    return posted
  end
end

#prepare_request(url) {|request| ... } ⇒ Object (protected)

Normalizes the request path and grabs a session to handle page get and post requests.

Parameters:

  • url (URI::HTTP)

    The URL to request.

Yields:

  • (request)

    A block whose purpose is to make a page request.

Yield Parameters:

  • session (Net::HTTP)

    An HTTP session object.

  • path (String)

    Normalized URL string.

  • headers (Hash)

    A Hash of request header options.

Since:

  • 0.2.2



770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
# File 'lib/spidr/agent.rb', line 770

#
# Normalizes the request path, assembles the request headers and grabs a
# session to handle page get and post requests.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [session,path,headers]
#   A block whose purpose is to make a page request.
#
# @yieldparam [Net::HTTP] session
#   The session looked up in @sessions for the URL.
#
# @yieldparam [String] path
#   The URL path ('/' when empty) with the query string appended.
#
# @yieldparam [Hash] headers
#   A Hash of request header options.
#
# @return [Object, nil]
#   The value of the block, or nil when one of the rescued network errors
#   was raised (the session is then killed and the URL recorded as failed).
#
def prepare_request(url,&block)
  host = url.host
  path = url.path.empty? ? '/' : url.path

  # append the URL query to the path
  path += "?#{url.query}" if url.query

  # set any additional HTTP headers
  headers = {}

  # the first per-host entry whose key matches the host wins
  override = @host_headers.find { |name,_header| host.match(name) }
  headers['Host'] = override.last if override

  # fall back to the global Host header only if no per-host entry applied
  headers['Host']     ||= @host_header if @host_header
  headers['User-Agent'] = @user_agent if @user_agent
  headers['Referer']    = @referer if @referer

  if (authorization = @authorized.for_url(url))
    headers['Authorization'] = "Basic #{authorization}"
  end

  if (header_cookies = @cookies.for_host(host))
    headers['Cookie'] = header_cookies
  end

  begin
    sleep(@delay) if @delay > 0

    yield @sessions[url], path, headers
  rescue SystemCallError,
         Timeout::Error,
         SocketError,
         IOError,
         OpenSSL::SSL::SSLError,
         Net::HTTPBadResponse

    @sessions.kill!(url)

    failed(url)
    return nil
  end
end

#proxy ⇒ Hash

The proxy information the agent uses.

Returns:

  • (Hash)

    The proxy information.

See Also:

Since:

  • 0.2.2



406
407
408
# File 'lib/spidr/agent.rb', line 406

#
# The proxy information the agent uses, as reported by the session cache.
#
# @return [Hash]
#   The value of @sessions.proxy.
#
def proxy
  cache = @sessions
  return cache.proxy
end

#proxy=(new_proxy) ⇒ Hash

Sets the proxy information that the agent uses.

Parameters:

  • new_proxy (Hash)

    The new proxy information.

Returns:

  • (Hash)

    The new proxy information.

See Also:

Since:

  • 0.2.2



423
424
425
# File 'lib/spidr/agent.rb', line 423

#
# Sets the proxy information that the agent uses by assigning it on the
# session cache.
#
# @param [Hash] new_proxy
#   The new proxy information.
#
# @return [Hash]
#   The new proxy information.
#
def proxy=(new_proxy)
  cache = @sessions
  cache.proxy = new_proxy
end

#queued?(url) ⇒ Boolean

Determines whether a given URL has been enqueued.

Parameters:

  • url (URI::HTTP)

    The URL to search for in the queue.

Returns:

  • (Boolean)

    Specifies whether the given URL has been queued for visiting.



568
569
570
# File 'lib/spidr/agent.rb', line 568

#
# Determines whether a given URL has been enqueued.
#
# @param [URI::HTTP] url
#   The URL to search for in the queue.
#
# @return [Boolean]
#   Whether @queue contains the URL.
#
def queued?(url)
  @queue.member?(url)
end

#run {|page| ... } ⇒ Object

Start spidering until the queue becomes empty or the agent is paused.

Yields:

  • (page)

    If a block is given, it will be passed every page visited.

Yield Parameters:

  • page (Page)

    A page which has been visited.



369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# File 'lib/spidr/agent.rb', line 369

#
# Spiders until the queue becomes empty or the agent is paused.
#
# @yield [page]
#   The block, if given, is forwarded to #visit_page.
#
# @yieldparam [Page] page
#   A page which has been visited.
#
# @return [Agent]
#   The agent itself.
#
def run(&block)
  @running = true

  while !@queue.empty? && !paused?
    begin
      visit_page(dequeue,&block)
    rescue Actions::Paused
      # leave immediately; @running and the sessions are left untouched
      return self
    rescue Actions::Action
      # every other action only aborts the current page
    end
  end

  @running = false
  @sessions.clear
  self
end

#running? ⇒ Boolean

Determines if the agent is running.

Returns:

  • (Boolean)

    Specifies whether the agent is running or stopped.



392
393
394
# File 'lib/spidr/agent.rb', line 392

#
# Determines if the agent is running.
#
# @return [Boolean]
#   true only when @running equals true.
#
def running?
  state = @running
  return state == true
end

#start_at(url) {|page| ... } ⇒ Object

Start spidering at a given URL.

Parameters:

  • url (URI::HTTP, String)

    The URL to start spidering at.

Yields:

  • (page)

    If a block is given, it will be passed every page visited.

Yield Parameters:

  • page (Page)

    A page which has been visited.



265
266
267
268
# File 'lib/spidr/agent.rb', line 265

#
# Enqueues the given URL and then starts spidering.
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at.
#
# @yield [page]
#   The block, if given, is forwarded to #run.
#
# @return [Object]
#   Whatever #run returns.
#
def start_at(url,&block)
  enqueue(url)

  run(&block)
end

#to_hash ⇒ Hash

Converts the agent into a Hash.

Returns:

  • (Hash)

    The agent represented as a Hash containing the history and the queue of the agent.



743
744
745
# File 'lib/spidr/agent.rb', line 743

#
# Converts the agent into a Hash.
#
# @return [Hash]
#   A Hash with the agent's history under :history and its queue under
#   :queue.
#
def to_hash
  snapshot = {}
  snapshot[:history] = @history
  snapshot[:queue]   = @queue
  snapshot
end

#visit?(url) ⇒ Boolean (protected)

Determines if a given URL should be visited.

Parameters:

  • url (URI::HTTP)

    The URL in question.

Returns:

  • (Boolean)

    Specifies whether the given URL should be visited.



843
844
845
846
847
848
849
850
851
# File 'lib/spidr/agent.rb', line 843

#
# Determines if a given URL should be visited: it must not have been
# visited already and must pass the scheme, host, port, link, URL and
# extension checks, in that order.
#
# @param [URI::HTTP] url
#   The URL in question.
#
# @return [Boolean]
#   Specifies whether the given URL should be visited.
#
def visit?(url)
  return false if visited?(url)

  visit_scheme?(url.scheme) &&
    visit_host?(url.host) &&
    visit_port?(url.port) &&
    visit_link?(url.to_s) &&
    visit_url?(url) &&
    visit_ext?(url.path)
end

#visit_page(url) {|page| ... } ⇒ Page?

Visits a given URL, and enqueues the links recovered from the URL to be visited later.

Parameters:

  • url (URI::HTTP, String)

    The URL to visit.

Yields:

  • (page)

    If a block is given, it will be passed the page which was visited.

Yield Parameters:

  • page (Page)

    The page which was visited.

Returns:

  • (Page, nil)

    The page that was visited. If nil is returned, either the request for the page failed, or the page was skipped.



700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
# File 'lib/spidr/agent.rb', line 700

#
# Visits a given URL, and enqueues the links recovered from the page to be
# visited later.
#
# @param [URI::HTTP, String] url
#   The URL to visit.
#
# @yield [page]
#   If a block is given, it will be passed the page which was visited.
#
# @yieldparam [Page] page
#   The page which was visited.
#
# @return [Page, nil]
#   The value returned by get_page, or nil if the page was skipped.
#
def visit_page(url)
  url = sanitize_url(url)

  get_page(url) do |page|
    @history << page.url

    begin
      @every_page_blocks.each { |callback| callback.call(page) }

      yield page if block_given?
    rescue Actions::Paused
      # a pause must reach the run loop, so it is re-raised here before the
      # generic Actions::Action clause below can swallow it
      raise
    rescue Actions::SkipPage
      return nil
    rescue Actions::Action
      # any other action leaves the page's links to be processed
    end

    page.each_url do |link|
      begin
        @every_link_blocks.each { |callback| callback.call(page.url,link) }
      rescue Actions::Paused
        raise
      rescue Actions::SkipLink
        next
      rescue Actions::Action
        # any other action leaves the link to be enqueued
      end

      depth = @levels[url]

      enqueue(link,depth + 1) if @max_depth.nil? || @max_depth > depth
    end
  end
end

#visited?(url) ⇒ Boolean

Determines whether a URL was visited or not.

Parameters:

  • url (URI::HTTP, String)

    The URL to search for.

Returns:

  • (Boolean)

    Specifies whether a URL was visited.



484
485
486
487
488
# File 'lib/spidr/agent.rb', line 484

#
# Determines whether a URL was visited or not.
#
# @param [URI::HTTP, String] url
#   The URL to search for. Anything that is not already a URI is converted
#   via its String form first.
#
# @return [Boolean]
#   Whether @history includes the URL.
#
def visited?(url)
  candidate = url.kind_of?(URI) ? url : URI(url.to_s)

  @history.include?(candidate)
end

#visited_hosts ⇒ Array<String>

Specifies all hosts that were visited.

Returns:

  • (Array<String>)

    The hosts which have been visited.



471
472
473
# File 'lib/spidr/agent.rb', line 471

#
# Specifies all hosts that were visited.
#
# @return [Array<String>]
#   The distinct hosts of the visited URLs.
#
def visited_hosts
  hosts = visited_urls.map(&:host)
  hosts.uniq
end

#visited_links ⇒ Array<String>

Specifies the links which have been visited.

Returns:

  • (Array<String>)

    The links which have been visited.



461
462
463
# File 'lib/spidr/agent.rb', line 461

#
# Specifies the links which have been visited.
#
# @return [Array<String>]
#   The String form of every URL in the history.
#
def visited_links
  @history.map(&:to_s)
end