Class: Anemone::Core

Inherits:
Object
  • Object
show all
Defined in:
lib/anemone/core.rb

Constant Summary collapse

# Default crawl options; user-supplied opts are expected to be merged over
# these (see +process_options+). All keys are symbols.
DEFAULT_OPTS =
{
  # run 4 Tentacle threads to fetch pages
  :threads => 4,
  # disable verbose output
  :verbose => false,
  # don't throw away the page response body after scanning it for links
  :discard_page_bodies => false,
  # identify self as Anemone/VERSION
  :user_agent => "Anemone/#{Anemone::VERSION}",
  # no delay between requests
  :delay => 0,
  # don't obey the robots exclusion protocol
  :obey_robots_txt => false,
  # by default, don't limit the depth of the crawl
  :depth_limit => false,
  # number of times HTTP redirects will be followed
  :redirect_limit => 5,
  # storage engine defaults to Hash in +process_options+ if none specified
  :storage => nil,
  # Hash of cookie name => value to send with HTTP requests
  :cookies => nil,
  # accept cookies from the server and send them back?
  :accept_cookies => false,
  # skip any link with a query string? e.g. http://foo.com/?u=user
  :skip_query_strings => false,
  # proxy server hostname
  :proxy_host => nil,
  # proxy server port number (was `false`; normalized to nil for consistency
  # with :proxy_host — both are falsy, so truthiness checks are unaffected)
  :proxy_port => nil,
  # HTTP read timeout in seconds
  :read_timeout => nil
}

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(urls, opts = {}) {|_self| ... } ⇒ Core

Initialize the crawl with starting urls (single URL or Array of URLs) and optional block

Yields:

  • (_self)

Yield Parameters:

  • _self (Anemone::Core)

    the object that the method was called on



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/anemone/core.rb', line 72

# Initialize the crawl with starting urls (single URL or Array of URLs)
# and an optional options Hash; yields self to an optional config block.
def initialize(urls, opts = {})
  # Coerce every start URL into a URI, giving path-less URLs a root path.
  @urls = [urls].flatten.map do |u|
    u.is_a?(URI) ? u : URI(u)
  end
  @urls.each do |u|
    u.path = '/' if u.path.empty?
  end

  # Callback registries and crawl state, all empty until configured.
  @tentacles            = []
  @after_crawl_blocks   = []
  @on_every_page_blocks = []
  @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
  @skip_link_patterns   = []
  @include_link_patterns = []
  @before_filter_urls   = nil
  @opts = opts

  yield self if block_given?
end

Instance Attribute Details

#optsObject (readonly)

Hash of options for the crawl



26
27
28
# File 'lib/anemone/core.rb', line 26

# Read-only accessor for the options Hash supplied to this crawl.
def opts
  @opts
end

#pagesObject (readonly)

PageStore storing all Page objects encountered during the crawl



24
25
26
# File 'lib/anemone/core.rb', line 24

# Read-only accessor for the PageStore holding every Page encountered
# during the crawl.
def pages
  @pages
end

Class Method Details

.crawl(urls, opts = {}) ⇒ Object

Convenience method to start a new crawl



91
92
93
94
95
96
# File 'lib/anemone/core.rb', line 91

# Convenience entry point: build a Core for +urls+/+opts+, yield it for
# configuration if a block is given, then start the crawl immediately.
# Returns the Core instance.
def self.crawl(urls, opts = {})
  new(urls, opts) do |crawler|
    yield crawler if block_given?
    crawler.run
  end
end

Instance Method Details

#after_crawl(&block) ⇒ Object

Add a block to be executed on the PageStore after the crawl is finished



102
103
104
105
# File 'lib/anemone/core.rb', line 102

# Register +block+ to run against the PageStore once the crawl has
# finished. Returns self so registrations can be chained.
def after_crawl(&block)
  @after_crawl_blocks.push(block)
  self
end

#filter_urls(&block) ⇒ Object



134
135
136
137
# File 'lib/anemone/core.rb', line 134

# Register a block used to filter the list of discovered URLs before they
# are enqueued (see do_filter_urls in run). Returns self for chaining.
def filter_urls(&block)
  tap { @before_filter_urls = block }
end

#focus_crawl(&block) ⇒ Object

Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.



156
157
158
159
# File 'lib/anemone/core.rb', line 156

# Register a block that selects which links to follow on each page; the
# block should return an Array of URI objects. Returns self for chaining.
def focus_crawl(&block)
  tap { @focus_crawl_block = block }
end

Add one or more Regex patterns for URLs which should be followed (include list)



120
121
122
123
# File 'lib/anemone/core.rb', line 120

# Add one or more Regexp patterns to the include list
# (@include_link_patterns). Accepts nested arrays and ignores nils.
# Returns self for chaining.
def include_links_like(*patterns)
  patterns.flatten.compact.each do |pattern|
    @include_link_patterns << pattern
  end
  self
end

#on_every_page(&block) ⇒ Object

Add a block to be executed on every Page as they are encountered during the crawl



129
130
131
132
# File 'lib/anemone/core.rb', line 129

# Register +block+ to be invoked with every Page as it is crawled.
# Returns self for chaining.
def on_every_page(&block)
  @on_every_page_blocks.push(block)
  self
end

#on_pages_like(*patterns, &block) ⇒ Object

Add a block to be executed on Page objects with a URL matching one or more patterns



143
144
145
146
147
148
149
150
# File 'lib/anemone/core.rb', line 143

# Register +block+ to run on every crawled Page whose URL matches one of
# +patterns+ (each pattern becomes a key in @on_pages_like_blocks).
# Returns self for chaining.
#
# Note: the previous `if patterns` guard was dead code — a splat parameter
# is always an Array (never nil), and iterating an empty Array is a no-op,
# so removing it preserves behavior exactly.
def on_pages_like(*patterns, &block)
  patterns.each do |pattern|
    @on_pages_like_blocks[pattern] << block
  end
  self
end

#runObject

Perform the crawl



164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/anemone/core.rb', line 164

# Perform the crawl: spawn worker threads, pump pages off the shared queue,
# run user callbacks, enqueue newly discovered links, and shut the workers
# down cleanly when both queues drain. Returns self.
def run
  # Finalize @opts/@pages etc. before starting (defined elsewhere in this file).
  process_options

  # Drop start URLs that fail visit_link? (skip patterns, robots, etc.);
  # nothing to do if none survive.
  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?

  # Workers consume from link_queue and push fetched results onto page_queue.
  link_queue = Queue.new
  page_queue = Queue.new

  # Spawn one Tentacle worker thread per :threads option.
  # NOTE(review): Tentacle is defined elsewhere — presumably it dequeues
  # links and enqueues Page objects; confirm in lib/anemone/tentacle.rb.
  @opts[:threads].times do
    @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
  end

  # Seed the frontier with the start URLs.
  @urls.each{ |url| link_queue.enq(url) }

  loop do
    # Blocks until a worker delivers a fetched page.
    page = page_queue.deq
    # Record the URL as seen so it will not be re-enqueued.
    @pages.touch_key page.url
    puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
    # Run on_every_page / on_pages_like user callbacks for this page.
    do_page_blocks page
    page.discard_doc! if @opts[:discard_page_bodies]

    # Discover follow-up links, optionally filtered by the user's
    # filter_urls block, then enqueue each as [link, referer, depth].
    links = links_to_follow page
    links = do_filter_urls(links, page.depth) if @before_filter_urls
    links.each do |link|
      link_queue << [link, page.url.dup, page.depth + 1]
    end
    # Pre-mark discovered links as known to avoid duplicate enqueues.
    @pages.touch_keys links

    @pages[page.url] = page

    # if we are done with the crawl, tell the threads to end
    if link_queue.empty? and page_queue.empty?
      # Wait until every worker is idle (blocked on link_queue.deq)...
      until link_queue.num_waiting == @tentacles.size
        Thread.pass
      end
      # ...then re-check page_queue: a worker may have delivered a page
      # while we were spinning. Only if still empty do we send one :END
      # sentinel per worker and stop the dispatch loop.
      if page_queue.empty?
        @tentacles.size.times { link_queue << :END }
        break
      end
    end
  end

  @tentacles.each { |thread| thread.join }
  do_after_crawl_blocks
  self
end

Add one or more Regex patterns for URLs which should not be followed



111
112
113
114
# File 'lib/anemone/core.rb', line 111

# Add one or more Regexp patterns for URLs which should NOT be followed
# (@skip_link_patterns). Accepts nested arrays and ignores nils.
# Returns self for chaining.
def skip_links_like(*patterns)
  patterns.flatten.compact.each do |pattern|
    @skip_link_patterns << pattern
  end
  self
end