Class: Anemone::Core

Inherits:

Object

Object
Anemone::Core

show all

Defined in: lib/anemone/core.rb

Constant Summary collapse

# Default values for the crawl options; each key is explained inline.
DEFAULT_OPTS =

{
  # number of Tentacle threads started to fetch pages
  :threads => 4,
  # verbose output is off by default; when enabled, each processed page's URL
  # and the current link queue size are printed
  :verbose => false,
  # by default, keep the page response body instead of discarding it once
  # the page has been processed
  :discard_page_bodies => false,
  # identify self as Anemone/VERSION
  :user_agent => "Anemone/#{Anemone::VERSION}",
  # no delay between requests
  :delay => 0,
  # by default, don't obey the robots exclusion protocol
  :obey_robots_txt => false,
  # by default, don't limit the depth of the crawl
  :depth_limit => false,
  # number of times HTTP redirects will be followed
  :redirect_limit => 5
}

Instance Attribute Summary collapse

#opts ⇒ Object

Hash of options for the crawl.
#pages ⇒ Object readonly

PageHash storing all Page objects encountered during the crawl.

Class Method Summary collapse

.crawl(urls, opts = {}) ⇒ Object

Convenience method to start a new crawl.

Instance Method Summary collapse

#after_crawl(&block) ⇒ Object

Add a block to be executed on the PageHash after the crawl is finished.
#focus_crawl(&block) ⇒ Object

Specify a block which will select which links to follow on each page.
#initialize(urls, opts = {}) {|_self| ... } ⇒ Core constructor

Initialize the crawl with starting urls (single URL or Array of URLs) and optional block.
#on_every_page(&block) ⇒ Object

Add a block to be executed on every Page as it is encountered during the crawl.
#on_pages_like(*patterns, &block) ⇒ Object

Add a block to be executed on Page objects with a URL matching one or more patterns.
#run ⇒ Object

Perform the crawl.
#skip_links_like(*patterns) ⇒ Object

Add one or more Regex patterns for URLs which should not be followed.

Constructor Details

#initialize(urls, opts = {}) {|_self| ... } ⇒ `Core`

Initialize the crawl with starting urls (single URL or Array of URLs) and optional block

Yields:

(_self)

Yield Parameters:

_self (Anemone::Core) —

the object that the method was called on

# File 'lib/anemone/core.rb', line 48

# Set up a crawl from one or more starting URLs.
#
# urls may be a single URL or an (optionally nested) Array of URLs; entries
# that are not already URI objects are converted with URI(). Any URL whose
# path is empty gets '/' as its path. opts is handed to process_options, and
# the new instance is yielded to the block when one is given.
def initialize(urls, opts = {})
  @urls = [urls].flatten.map do |candidate|
    candidate.is_a?(URI) ? candidate : URI(candidate)
  end
  @urls.each do |uri|
    uri.path = '/' if uri.path.empty?
  end

  # per-crawl state: worker threads, results, registered callbacks and
  # skip patterns
  @tentacles = []
  @pages = PageHash.new
  @on_every_page_blocks = []
  # each pattern lazily gets its own Array of blocks on first access
  @on_pages_like_blocks = Hash.new { |registry, pattern| registry[pattern] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []

  process_options(opts)

  yield(self) if block_given?
end

Instance Attribute Details

#opts ⇒ `Object`

Hash of options for the crawl



23
24
25

# File 'lib/anemone/core.rb', line 23

# Hash of options for the crawl.
#
# Returns the current value of @opts.
def opts
  @opts
end

#pages ⇒ `Object` (readonly)

PageHash storing all Page objects encountered during the crawl



20
21
22

# File 'lib/anemone/core.rb', line 20

# PageHash storing all Page objects encountered during the crawl.
#
# Returns the current value of @pages.
def pages
  @pages
end

Class Method Details

.crawl(urls, opts = {}) ⇒ `Object`

Convenience method to start a new crawl

# File 'lib/anemone/core.rb', line 67

# Convenience method to start a new crawl.
#
# Builds a new instance from urls and opts, yields it to the caller's block
# (when one is given) so the crawl can be configured, then runs the crawl.
# Returns whatever +new+ returns.
def self.crawl(urls, opts = {})
  new(urls, opts) do |crawler|
    yield(crawler) if block_given?
    crawler.run
  end
end

Instance Method Details

#after_crawl(&block) ⇒ `Object`

Add a block to be executed on the PageHash after the crawl is finished

# File 'lib/anemone/core.rb', line 78

# Register a block to be executed on the PageHash after the crawl is
# finished.
#
# The block is appended to @after_crawl_blocks. Returns self so that
# configuration calls can be chained.
def after_crawl(&block)
  @after_crawl_blocks.push(block)
  self
end

#focus_crawl(&block) ⇒ `Object`

Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.

# File 'lib/anemone/core.rb', line 118

# Specify the block which will select which links to follow on each page.
# The block should return an Array of URI objects.
#
# The block replaces any previously stored @focus_crawl_block. Returns self
# so that configuration calls can be chained.
def focus_crawl(&block)
  tap { @focus_crawl_block = block }
end

#on_every_page(&block) ⇒ `Object`

Add a block to be executed on every Page as it is encountered during the crawl

# File 'lib/anemone/core.rb', line 96

# Register a block to be executed on every Page as it is encountered during
# the crawl.
#
# The block is appended to @on_every_page_blocks. Returns self so that
# configuration calls can be chained.
def on_every_page(&block)
  @on_every_page_blocks.push(block)
  self
end

#on_pages_like(*patterns, &block) ⇒ `Object`

Add a block to be executed on Page objects with a URL matching one or more patterns

# File 'lib/anemone/core.rb', line 105

# Register a block to be executed on Page objects with a URL matching one or
# more patterns.
#
# patterns may be given as separate arguments or as (nested) Arrays; nil
# entries are ignored. This is the same normalization skip_links_like
# applies, so an Array of patterns is registered element by element instead
# of being stored as a single key. The block is appended to the list kept
# for each pattern in @on_pages_like_blocks.
#
# Returns self so that configuration calls can be chained.
def on_pages_like(*patterns, &block)
  # a splat parameter is always an Array, so no nil guard is needed here
  patterns.flatten.compact.each do |pattern|
    @on_pages_like_blocks[pattern] << block
  end
  self
end

#run ⇒ `Object`

Perform the crawl

# File 'lib/anemone/core.rb', line 126

# Perform the crawl and return self once it has finished.
#
# Start URLs rejected by visit_link? are dropped first. If none remain, the
# method returns nil immediately: no threads are started and the after-crawl
# blocks are not run.
def run
  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?
  
  # both queues are handed to every Tentacle; this method enqueues links onto
  # link_queue and dequeues pages from page_queue
  link_queue = Queue.new
  page_queue = Queue.new

  # start @opts[:threads] threads, each running its own Tentacle
  @opts[:threads].times do
    @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
  end
  
  # seed the crawl with the starting URLs
  @urls.each{ |url| link_queue.enq(url) }

  loop do
    # take the next page off page_queue (Queue#deq waits while it is empty)
    page = page_queue.deq
    
    @pages[page.url] = page
    
    puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
    
    # run the registered page blocks for this page (see do_page_blocks)
    do_page_blocks(page)

    page.discard_doc! if @opts[:discard_page_bodies]
    
    # queue every link selected by links_to_follow together with the page it
    # was found on, and store a nil placeholder for it in the page hash
    # NOTE(review): the placeholder presumably keeps the link from being
    # queued twice -- confirm against visit_link?
    links_to_follow(page).each do |link|
      link_queue.enq([link, page])
      @pages[link] = nil
    end
    
    # create an entry in the page hash for each alias of this page,
    # i.e. all the pages that redirected to this page
    page.aliases.each do |aka|
      # only create the alias entry when there is none yet or it is still a
      # nil placeholder
      if !@pages.has_key?(aka) or @pages[aka].nil?
        @pages[aka] = page.alias_clone(aka)
      end
      @pages[aka].add_alias!(page.url)
    end
    
    # if we are done with the crawl, tell the threads to end
    if link_queue.empty? and page_queue.empty?
      # spin until every tentacle thread is waiting on link_queue
      until link_queue.num_waiting == @tentacles.size
        Thread.pass
      end
      
      # re-check page_queue, since a page may have arrived while we waited;
      # if it is still empty, enqueue one :END marker per tentacle and leave
      # the loop
      # NOTE(review): :END is presumably the stop signal handled by
      # Tentacle#run -- verify
      if page_queue.empty?
        @tentacles.size.times { link_queue.enq(:END)}
        break
      end
    end
    
  end

  # wait for all tentacle threads to finish before running after-crawl blocks
  @tentacles.each { |t| t.join }

  do_after_crawl_blocks()
  
  self
end

#skip_links_like(*patterns) ⇒ `Object`

Add one or more Regex patterns for URLs which should not be followed

# File 'lib/anemone/core.rb', line 87

# Add one or more Regex patterns for URLs which should not be followed.
#
# patterns may be given as separate arguments or as (nested) Arrays; nil
# entries are dropped. The remaining patterns are appended to
# @skip_link_patterns. Returns self so that configuration calls can be
# chained.
def skip_links_like(*patterns)
  usable = patterns.flatten.compact
  @skip_link_patterns.concat(usable)
  self
end

Class: Anemone::Core

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(urls, opts = {}) {|_self| ... } ⇒ Core

Instance Attribute Details

#opts ⇒ Object

#pages ⇒ Object (readonly)

Class Method Details

.crawl(urls, opts = {}) ⇒ Object

Instance Method Details

#after_crawl(&block) ⇒ Object

#focus_crawl(&block) ⇒ Object

#on_every_page(&block) ⇒ Object

#on_pages_like(*patterns, &block) ⇒ Object

#run ⇒ Object

#skip_links_like(*patterns) ⇒ Object