Class: Anemone::Core

Inherits:
Object
  • Object
show all
Defined in:
lib/anemone/core.rb

Constant Summary collapse

# Default values for the crawl options.
DEFAULT_OPTS = {
  # number of Tentacle threads started to fetch pages
  :threads             => 4,
  # verbose output is off
  :verbose             => false,
  # keep each page's response body after it has been scanned for links
  :discard_page_bodies => false,
  # User-Agent string: Anemone/VERSION
  :user_agent          => "Anemone/#{Anemone::VERSION}",
  # no pause between requests
  :delay               => 0,
  # the robots exclusion protocol is not honoured
  :obey_robots_txt     => false,
  # false means the crawl depth is unlimited
  :depth_limit         => false,
  # how many HTTP redirects will be followed
  :redirect_limit      => 5
}

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(urls, opts = {}) {|_self| ... } ⇒ Core

Initialize the crawl with the starting URLs (a single URL or an Array of URLs) and an optional block

Yields:

  • (_self)

Yield Parameters:

  • _self (Anemone::Core)

    the object that the method was called on



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/anemone/core.rb', line 48

# Set up a crawl.
#
# urls - a single URL or an (optionally nested) Array of URLs; each entry
#        may be a URI or anything URI() accepts.
# opts - Hash of options, handed to process_options.
#
# Yields the new instance when a block is given, so the caller can register
# callbacks before the crawl is run.
def initialize(urls, opts = {})
  # Callback and bookkeeping containers start out empty.
  @after_crawl_blocks = []
  @skip_link_patterns = []
  @on_pages_like_blocks = Hash.new { |registry, pattern| registry[pattern] = [] }
  @on_every_page_blocks = []
  @pages = PageHash.new
  @tentacles = []

  # First convert every entry to a URI ...
  @urls = [urls].flatten.map do |candidate|
    candidate.is_a?(URI) ? candidate : URI(candidate)
  end
  # ... then give URLs with an empty path the root path.
  @urls.each do |uri|
    uri.path = '/' if uri.path.empty?
  end

  process_options(opts)

  yield(self) if block_given?
end

Instance Attribute Details

#opts ⇒ Object

Hash of options for the crawl



23
24
25
# File 'lib/anemone/core.rb', line 23

# Reader for the crawl options: returns the Hash held in @opts.
def opts
  @opts
end

#pages ⇒ Object (readonly)

PageHash storing all Page objects encountered during the crawl



20
21
22
# File 'lib/anemone/core.rb', line 20

# Reader for the PageHash held in @pages, which stores the Page objects
# encountered during the crawl.
def pages
  @pages
end

Class Method Details

.crawl(urls, opts = {}) ⇒ Object

Convenience method to start a new crawl



67
68
69
70
71
72
# File 'lib/anemone/core.rb', line 67

# Convenience entry point: build a new instance for +urls+ / +opts+, hand it
# to the optional caller block for configuration, then run the crawl.
#
# Returns whatever .new returns.
def self.crawl(urls, opts = {})
  new(urls, opts) do |crawler|
    # Let the caller register callbacks before the crawl starts.
    yield(crawler) if block_given?
    crawler.run
  end
end

Instance Method Details

#after_crawl(&block) ⇒ Object

Add a block to be executed on the PageHash after the crawl is finished



78
79
80
81
# File 'lib/anemone/core.rb', line 78

# Register a block to be executed on the PageHash after the crawl is
# finished.
#
# The block is appended to @after_crawl_blocks. A call without a block is
# ignored, so that nil is never queued as a callback (previously nil was
# stored and would only surface as an error when the callbacks are run).
#
# Returns self, so registrations can be chained.
def after_crawl(&block)
  @after_crawl_blocks << block if block
  self
end

#focus_crawl(&block) ⇒ Object

Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.



118
119
120
121
# File 'lib/anemone/core.rb', line 118

# Install the block that selects which links to follow on each page; the
# block should return an Array of URI objects.
#
# The block replaces any previously installed one (calling without a block
# stores nil). Returns self.
def focus_crawl(&block)
  tap { @focus_crawl_block = block }
end

#on_every_page(&block) ⇒ Object

Add a block to be executed on every Page as they are encountered during the crawl



96
97
98
99
# File 'lib/anemone/core.rb', line 96

# Register a block to be executed on every Page as it is encountered during
# the crawl.
#
# The block is appended to @on_every_page_blocks. A call without a block is
# ignored, so that nil is never queued as a callback (previously nil was
# stored and would only surface as an error when the callbacks are run).
#
# Returns self, so registrations can be chained.
def on_every_page(&block)
  @on_every_page_blocks << block if block
  self
end

#on_pages_like(*patterns, &block) ⇒ Object

Add a block to be executed on Page objects with a URL matching one or more patterns



105
106
107
108
109
110
111
112
# File 'lib/anemone/core.rb', line 105

# Register a block to be executed on Page objects whose URL matches one or
# more of the given patterns.
#
# patterns - one or more patterns; nested Arrays are flattened and nil
#            entries are dropped, mirroring how skip_links_like treats its
#            arguments.
# block    - the callback; it is appended to the list kept for each pattern
#            in @on_pages_like_blocks. A call without a block registers
#            nothing, so nil is never stored as a callback.
#
# Returns self, so registrations can be chained.
def on_pages_like(*patterns, &block)
  return self unless block

  # A splat is always an Array, so no nil check on +patterns+ is needed.
  patterns.flatten.compact.each do |pattern|
    @on_pages_like_blocks[pattern] << block
  end
  self
end

#run ⇒ Object

Perform the crawl



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# File 'lib/anemone/core.rb', line 126

# Perform the crawl.
#
# Starts @opts[:threads] worker threads (each running a Tentacle), feeds them
# links through link_queue and consumes the pages they produce from
# page_queue until no work is left, then joins the threads and runs the
# after-crawl blocks.
#
# Returns self after a completed crawl. Returns nil, without starting any
# thread or running the after-crawl blocks, when none of the start URLs
# passes visit_link?.
def run
  # drop start URLs that should not be visited; nothing to do if none remain
  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?
  
  # link_queue carries work to the tentacles, page_queue carries results back
  link_queue = Queue.new
  page_queue = Queue.new

  # start the worker threads; all of them share the same two queues
  @opts[:threads].times do
    @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
  end
  
  # seed the crawl (start URLs are queued bare, without a referring page)
  @urls.each{ |url| link_queue.enq(url) }

  loop do
    # take the next fetched page off the result queue
    page = page_queue.deq
    
    @pages[page.url] = page
    
    puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
    
    # run the registered page callbacks for this page
    do_page_blocks(page)

    # the body is only dropped after the callbacks above have seen the page
    page.discard_doc! if @opts[:discard_page_bodies]
    
    # queue each link to follow together with the page it was found on, and
    # reserve a nil entry for it in the page hash
    # NOTE(review): presumably the nil entry marks the link as already queued
    # so it is not queued twice - confirm against links_to_follow/visit_link?
    links_to_follow(page).each do |link|
      link_queue.enq([link, page])
      @pages[link] = nil
    end
    
    # create an entry in the page hash for each alias of this page,
    # i.e. all the pages that redirected to this page
    page.aliases.each do |aka|
      # only fill in the alias when it has no real page yet (missing or nil)
      if !@pages.has_key?(aka) or @pages[aka].nil?
        @pages[aka] = page.alias_clone(aka)
      end
      @pages[aka].add_alias!(page.url)
    end
    
    # if we are done with the crawl, tell the threads to end
    if link_queue.empty? and page_queue.empty?
      # empty queues alone are not proof of completion: wait until every
      # tentacle thread is blocked waiting on link_queue
      until link_queue.num_waiting == @tentacles.size
        Thread.pass
      end
      
      # re-check page_queue, since a page may have arrived while we waited;
      # if it is still empty, send one :END marker per tentacle and stop
      # NOTE(review): presumably Tentacle#run exits when it dequeues :END -
      # confirm in Tentacle
      if page_queue.empty?
        @tentacles.size.times { link_queue.enq(:END)}
        break
      end
    end
    
  end

  # wait for all worker threads to finish
  @tentacles.each { |t| t.join }

  do_after_crawl_blocks()
  
  self
end

#skip_links_like(*patterns) ⇒ Object

Add one or more Regex patterns for URLs which should not be followed



87
88
89
90
# File 'lib/anemone/core.rb', line 87

# Add one or more patterns for URLs which should not be followed.
#
# patterns - any mix of patterns and (nested) Arrays of patterns; the
#            arguments are flattened and nil entries are skipped before
#            being appended to @skip_link_patterns.
#
# Returns self, so calls can be chained.
def skip_links_like(*patterns)
  patterns.flatten.each do |pattern|
    @skip_link_patterns << pattern unless pattern.nil?
  end
  self
end