Class: Medusa::Core

Inherits:
Object
Defined in:
lib/medusa/core.rb

Constant Summary

DEFAULT_OPTS =
{
  # run 4 Tentacle threads to fetch pages
  :threads => 4,
  # don't throw away the page response body after scanning it for links
  :discard_page_bodies => false,
  # identify self as Medusa/VERSION
  :user_agent => "Medusa/#{Medusa::VERSION}",
  # no delay between requests
  :delay => 0,
  # don't obey the robots exclusion protocol
  :obey_robots_txt => false,
  # by default, don't limit the depth of the crawl
  :depth_limit => false,
  # number of times HTTP redirects will be followed
  :redirect_limit => 5,
  # storage engine; defaults to an in-memory store in +process_options+ if none is specified
  :storage => nil,
  # clear the storage on every startup of the crawler
  :clear_on_startup => true,
  # Hash of cookie name => value to send with HTTP requests
  :cookies => nil,
  # accept cookies from the server and send them back?
  :accept_cookies => false,
  # skip any link with a query string? e.g. http://foo.com/?u=user
  :skip_query_strings => false,
  # proxy server hostname
  :proxy_host => nil,
  # proxy server port number
  :proxy_port => false,
  # HTTP read timeout in seconds
  :read_timeout => nil
}.freeze
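
Any of these defaults can be overridden via the opts hash passed to #initialize or .crawl. A brief sketch (the URL and option values are illustrative, not defaults):

Medusa::Core.crawl("http://example.com",
                   :threads => 2,
                   :delay => 1,
                   :obey_robots_txt => true,
                   :depth_limit => 3)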

Instance Attribute Summary

Class Method Summary

Instance Method Summary

Constructor Details

#initialize(urls, opts = {}) {|_self| ... } ⇒ Core

Initialize the crawl with starting URLs (a single URL or an Array of URLs) and an optional configuration block

Yields:

  • (_self)

Yield Parameters:

  • _self (Medusa::Core)

    the object that the method was called on



# File 'lib/medusa/core.rb', line 70

def initialize(urls, opts = {})
  @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
  @urls.each{ |url| url.path = '/' if url.path.empty? }

  @tentacles = []
  @on_every_page_blocks = []
  @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
  @opts = opts
  @focus_crawl_block = nil

  yield self if block_given?
end
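
A usage sketch (the URL is illustrative); note that the constructor only configures the crawl, and #run must be called to start it:

core = Medusa::Core.new("http://example.com") do |crawler|
  crawler.skip_links_like(/logout/)
end
core.run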

Instance Attribute Details

#opts ⇒ Object (readonly)

Hash of options for the crawl



# File 'lib/medusa/core.rb', line 24

def opts
  @opts
end

#pages ⇒ Object (readonly)

PageStore storing all Page objects encountered during the crawl



# File 'lib/medusa/core.rb', line 22

def pages
  @pages
end

Class Method Details

.crawl(urls, opts = {}) ⇒ Object

Convenience method to start a new crawl



# File 'lib/medusa/core.rb', line 89

def self.crawl(urls, opts = {})
  self.new(urls, opts) do |core|
    yield core if block_given?
    core.run
  end
end
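
For example (illustrative URL); unlike #initialize, .crawl starts the crawl itself by calling #run after the block returns:

Medusa::Core.crawl("http://example.com", :depth_limit => 2) do |core|
  core.on_every_page { |page| puts page.url }
end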

Instance Method Details

#after_crawl(&block) ⇒ Object

Add a block to be executed on the PageStore after the crawl is finished



# File 'lib/medusa/core.rb', line 100

def after_crawl(&block)
  @after_crawl_blocks << block
  self
end
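
For example, a sketch that reports how many pages were stored (it assumes the PageStore yielded to the block responds to #size):

core.after_crawl do |pages|
  # pages is the PageStore; #size is assumed here for illustration
  puts "Crawled #{pages.size} pages"
end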

#focus_crawl(&block) ⇒ Object

Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.



# File 'lib/medusa/core.rb', line 140

def focus_crawl(&block)
  @focus_crawl_block = block
  self
end
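
For example, to restrict the crawl to links on the same host as the current page (a sketch; it assumes page.links returns the page's outbound links as URI objects and page.url its own URI):

core.focus_crawl do |page|
  # the block must return an Array of URI objects
  page.links.select { |link| link.host == page.url.host }
end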

#on_every_page(&block) ⇒ Object

Add a block to be executed on every Page as it is encountered during the crawl



# File 'lib/medusa/core.rb', line 118

def on_every_page(&block)
  @on_every_page_blocks << block
  self
end
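
For example (page.url is used here just as in the crawl loop of #run):

core.on_every_page do |page|
  puts "Visited: #{page.url}"
end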

#on_pages_like(*patterns, &block) ⇒ Object

Add a block to be executed on Page objects with a URL matching one or more patterns



# File 'lib/medusa/core.rb', line 127

def on_pages_like(*patterns, &block)
  if patterns
    patterns.each do |pattern|
      @on_pages_like_blocks[pattern] << block
    end
  end
  self
end
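
For example (the pattern is illustrative):

core.on_pages_like(%r{/articles/}) do |page|
  puts "Matched article page: #{page.url}"
end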

#run ⇒ Object

Perform the crawl



# File 'lib/medusa/core.rb', line 148

def run

  process_options

  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?

  link_queue = Queue.new
  page_queue = Queue.new

  @opts[:threads].times do
    @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts.dup).run }
  end

  @urls.each{ |url| link_queue.enq(url) }

  loop do
    page = page_queue.deq
    @pages.touch_key page.url
    do_page_blocks page
    page.discard_doc! if @opts[:discard_page_bodies]

    links = links_to_follow page
    links.each do |link|
      link_queue << [link, page.url.dup, page.depth + 1]
    end
    @pages.touch_keys links

    @pages[page.url] = page

    # if we are done with the crawl, tell the threads to end
    if link_queue.empty? and page_queue.empty?
      until link_queue.num_waiting == @tentacles.size
        Thread.pass
      end
      if page_queue.empty?
        @tentacles.size.times { link_queue << :END }
        break
      end
    end
  end

  @tentacles.each { |thread| thread.join }
  do_after_crawl_blocks
  self
end
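
Because #run and the configuration methods above all return self, a crawl can be configured and started as a single chain (illustrative URL and pattern):

Medusa::Core.new("http://example.com")
            .skip_links_like(/private/)
            .on_every_page { |page| puts page.url }
            .run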

#skip_links_like(*patterns) ⇒ Object

Add one or more Regexp patterns for URLs which should not be followed



# File 'lib/medusa/core.rb', line 109

def skip_links_like(*patterns)
  @skip_link_patterns.concat [patterns].flatten.compact
  self
end
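
For example (patterns are illustrative):

core.skip_links_like(/calendar/, %r{/sessions/})
# nested Arrays also work, since the method flattens its arguments
core.skip_links_like([/logout/, /signout/])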