Class: Medusa::Core

Inherits:
Object
Defined in:
lib/medusa/core.rb

Constant Summary

DEFAULT_OPTS =
{
  # run 4 Tentacle threads to fetch pages
  :threads => 4,
  # don't throw away the page response body after scanning it for links
  :discard_page_bodies => false,
  # identify self as Medusa/VERSION
  :user_agent => "Medusa/#{Medusa::VERSION}",
  # no delay between requests
  :delay => 0,
  # don't obey the robots exclusion protocol
  :obey_robots_txt => false,
  # by default, don't limit the depth of the crawl
  :depth_limit => false,
  # number of times HTTP redirects will be followed
  :redirect_limit => 5,
  # storage engine; defaults to an in-memory store in +process_options+ if none is specified
  :storage => nil,
  # clear the storage on every startup of the crawler
  :clear_on_startup => true,
  # Hash of cookie name => value to send with HTTP requests
  :cookies => nil,
  # accept cookies from the server and send them back?
  :accept_cookies => false,
  # skip any link with a query string? e.g. http://foo.com/?u=user
  :skip_query_strings => false,
  # proxy server hostname
  :proxy_host => nil,
  # proxy server port number
  :proxy_port => false,
  # HTTP read timeout in seconds
  :read_timeout => nil
}.freeze
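
Any of these defaults can be overridden via the opts hash passed to #initialize or .crawl. A brief sketch (the URL and option values are illustrative, not defaults):

Medusa::Core.crawl("http://example.com",
                   :threads => 2,
                   :delay => 1,
                   :obey_robots_txt => true,
                   :depth_limit => 3)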

Instance Attribute Summary

Class Method Summary

Instance Method Summary

Constructor Details

#initialize(urls, opts = {}) {|_self| ... } ⇒ Core

Initialize the crawl with starting URLs (a single URL or an Array of URLs) and an optional configuration block

Yields:

  • (_self)

Yield Parameters:

  • _self (Medusa::Core)

    the object that the method was called on



# File 'lib/medusa/core.rb', line 70

def initialize(urls, opts = {})
  @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
  @urls.each{ |url| url.path = '/' if url.path.empty? }

  @tentacles = []
  @on_every_page_blocks = []
  @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
  @opts = opts
  @focus_crawl_block = nil

  yield self if block_given?
end
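
A usage sketch (the URL is illustrative); note that the constructor only configures the crawl, and #run must be called to start it:

core = Medusa::Core.new("http://example.com") do |crawler|
  crawler.skip_links_like(/logout/)
end
core.run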

Instance Attribute Details

#opts ⇒ Object (readonly)

Hash of options for the crawl



# File 'lib/medusa/core.rb', line 24

def opts
  @opts
end

#pages ⇒ Object (readonly)

PageStore storing all Page objects encountered during the crawl



# File 'lib/medusa/core.rb', line 22

def pages
  @pages
end

Class Method Details

.crawl(urls, opts = {}) ⇒ Object

Convenience method to start a new crawl



# File 'lib/medusa/core.rb', line 89

def self.crawl(urls, opts = {})
  self.new(urls, opts) do |core|
    yield core if block_given?
    core.run
  end
end
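
For example (illustrative URL); unlike #initialize, .crawl starts the crawl itself by calling #run after the block returns:

Medusa::Core.crawl("http://example.com", :depth_limit => 2) do |core|
  core.on_every_page { |page| puts page.url }
end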

Instance Method Details

#after_crawl(&block) ⇒ Object

Add a block to be executed on the PageStore after the crawl is finished



# File 'lib/medusa/core.rb', line 100

def after_crawl(&block)
  @after_crawl_blocks << block
  self
end
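
For example, a sketch that reports how many pages were stored (it assumes the PageStore yielded to the block responds to #size):

core.after_crawl do |pages|
  # pages is the PageStore; #size is assumed here for illustration
  puts "Crawled #{pages.size} pages"
end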

#focus_crawl(&block) ⇒ Object

Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.



# File 'lib/medusa/core.rb', line 140

def focus_crawl(&block)
  @focus_crawl_block = block
  self
end
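
For example, to restrict the crawl to links on the same host as the current page (a sketch; it assumes page.links returns the page's outbound links as URI objects and page.url its own URI):

core.focus_crawl do |page|
  # the block must return an Array of URI objects
  page.links.select { |link| link.host == page.url.host }
end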

#on_every_page(&block) ⇒ Object

Add a block to be executed on every Page as it is encountered during the crawl



# File 'lib/medusa/core.rb', line 118

def on_every_page(&block)
  @on_every_page_blocks << block
  self
end
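
For example (page.url is used here just as in the crawl loop of #run):

core.on_every_page do |page|
  puts "Visited: #{page.url}"
end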

#on_pages_like(*patterns, &block) ⇒ Object

Add a block to be executed on Page objects with a URL matching one or more patterns



# File 'lib/medusa/core.rb', line 127

def on_pages_like(*patterns, &block)
  if patterns
    patterns.each do |pattern|
      @on_pages_like_blocks[pattern] << block
    end
  end
  self
end
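
For example (the pattern is illustrative):

core.on_pages_like(%r{/articles/}) do |page|
  puts "Matched article page: #{page.url}"
end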

#run ⇒ Object

Perform the crawl



# File 'lib/medusa/core.rb', line 148

def run

  process_options

  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?

  link_queue = Queue.new
  page_queue = Queue.new

  @opts[:threads].times do
    @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts.dup).run }
  end

  @urls.each{ |url| link_queue.enq(url) }

  loop do
    page = page_queue.deq
    @pages.touch_key page.url
    do_page_blocks page
    page.discard_doc! if @opts[:discard_page_bodies]

    links = links_to_follow page
    links.each do |link|
      link_queue << [link, page.url.dup, page.depth + 1]
    end
    @pages.touch_keys links

    @pages[page.url] = page

    # if we are done with the crawl, tell the threads to end
    if link_queue.empty? and page_queue.empty?
      until link_queue.num_waiting == @tentacles.size
        Thread.pass
      end
      if page_queue.empty?
        @tentacles.size.times { link_queue << :END }
        break
      end
    end
  end

  @tentacles.each { |thread| thread.join }
  do_after_crawl_blocks
  self
end
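
Because #run and the configuration methods above all return self, a crawl can be configured and started as a single chain (illustrative URL and pattern):

Medusa::Core.new("http://example.com")
            .skip_links_like(/private/)
            .on_every_page { |page| puts page.url }
            .run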

#skip_links_like(*patterns) ⇒ Object

Add one or more Regexp patterns for URLs which should not be followed



# File 'lib/medusa/core.rb', line 109

def skip_links_like(*patterns)
  @skip_link_patterns.concat [patterns].flatten.compact
  self
end
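
For example (patterns are illustrative):

core.skip_links_like(/calendar/, %r{/sessions/})
# nested Arrays also work, since the method flattens its arguments
core.skip_links_like([/logout/, /signout/])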