Class: Anemone::Core
- Inherits:
-
Object
- Object
- Anemone::Core
- Defined in:
- lib/anemone/core.rb
Constant Summary collapse
- DEFAULT_OPTS =
{
  # run 4 Tentacle threads to fetch pages
  :threads => 4,
  # disable verbose output
  :verbose => false,
  # don't throw away the page response body after scanning it for links
  :discard_page_bodies => false,
  # identify self as Anemone/VERSION
  :user_agent => "Anemone/#{Anemone::VERSION}",
  # no delay between requests
  :delay => 0,
  # don't obey the robots exclusion protocol
  :obey_robots_txt => false,
  # by default, don't limit the depth of the crawl
  :depth_limit => false,
  # number of times HTTP redirects will be followed
  :redirect_limit => 5,
  # storage engine defaults to Hash in +process_options+ if none specified
  :storage => nil,
  # Hash of cookie name => value to send with HTTP requests
  :cookies => nil,
  # accept cookies from the server and send them back?
  :accept_cookies => false,
  # skip any link with a query string? e.g. http://foo.com/?u=user
  :skip_query_strings => false,
  # proxy server hostname
  :proxy_host => nil,
  # proxy server port number
  # NOTE(review): the "unset" value here is false while :proxy_host uses nil;
  # confirm the consumer treats both as "no proxy".
  :proxy_port => false,
  # HTTP read timeout in seconds
  :read_timeout => nil
}
Instance Attribute Summary collapse
-
#opts ⇒ Object
readonly
Hash of options for the crawl.
-
#pages ⇒ Object
readonly
PageStore storing all Page objects encountered during the crawl.
Class Method Summary collapse
-
.crawl(urls, opts = {}) ⇒ Object
Convenience method to start a new crawl.
Instance Method Summary collapse
-
#after_crawl(&block) ⇒ Object
Add a block to be executed on the PageStore after the crawl is finished.
- #filter_urls(&block) ⇒ Object
-
#focus_crawl(&block) ⇒ Object
Specify a block which will select which links to follow on each page.
-
#include_links_like(*patterns) ⇒ Object
Add one or more Regex patterns for URLs which should be followed.
-
#initialize(urls, opts = {}) {|_self| ... } ⇒ Core
constructor
Initialize the crawl with starting urls (single URL or Array of URLs) and optional block.
-
#on_every_page(&block) ⇒ Object
Add a block to be executed on every Page as they are encountered during the crawl.
-
#on_pages_like(*patterns, &block) ⇒ Object
Add a block to be executed on Page objects with a URL matching one or more patterns.
-
#run ⇒ Object
Perform the crawl.
-
#skip_links_like(*patterns) ⇒ Object
Add one or more Regex patterns for URLs which should not be followed.
Constructor Details
#initialize(urls, opts = {}) {|_self| ... } ⇒ Core
Initialize the crawl with starting urls (single URL or Array of URLs) and optional block
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# File 'lib/anemone/core.rb', line 72
#
# Initialize the crawl with starting urls (single URL or Array of URLs)
# and an optional block.
#
# urls - a String/URI or a (possibly nested) Array of them. Strings are
#        parsed with Kernel#URI; URI objects are copied so the caller's
#        instances are never modified by the path normalisation below.
# opts - Hash of crawl options, stored as-is in @opts.
#
# Yields the new instance to the block, if one is given, after all state
# has been set up.
def initialize(urls, opts = {})
  # dup caller-supplied URIs: the next line assigns to #path and must not
  # leak that change back into objects the caller still holds
  @urls = [urls].flatten.map { |url| url.is_a?(URI) ? url.dup : URI(url) }
  # an empty path ("http://host") is normalised to the root path
  @urls.each { |url| url.path = '/' if url.path.empty? }

  @tentacles = []
  @on_every_page_blocks = []
  @before_filter_urls = nil
  # set explicitly, like @before_filter_urls, so it is defined before
  # #focus_crawl is ever called
  @focus_crawl_block = nil
  # each pattern gets its own Array of blocks on first access
  @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
  @skip_link_patterns = []
  @include_link_patterns = []
  @after_crawl_blocks = []
  @opts = opts

  yield self if block_given?
end
Instance Attribute Details
#opts ⇒ Object (readonly)
Hash of options for the crawl
26 27 28 |
# File 'lib/anemone/core.rb', line 26 def opts @opts end |
#pages ⇒ Object (readonly)
PageStore storing all Page objects encountered during the crawl
24 25 26 |
# File 'lib/anemone/core.rb', line 24 def pages @pages end |
Class Method Details
.crawl(urls, opts = {}) ⇒ Object
Convenience method to start a new crawl
91 92 93 94 95 96 |
# File 'lib/anemone/core.rb', line 91
#
# Convenience method to start a new crawl: builds an instance, hands it to
# the caller's block (if any) for configuration, then runs it.
# Returns whatever +new+ returns.
def self.crawl(urls, opts = {})
  new(urls, opts) do |crawler|
    # let the caller register callbacks/filters before the crawl starts
    yield crawler if block_given?
    crawler.run
  end
end
Instance Method Details
#after_crawl(&block) ⇒ Object
Add a block to be executed on the PageStore after the crawl is finished
102 103 104 105 |
# File 'lib/anemone/core.rb', line 102
#
# Add a block to be executed on the PageStore after the crawl is finished.
# Returns self so registrations can be chained.
def after_crawl(&block)
  # a call without a block used to append nil to the callback list; skip it
  # so the list only ever contains callable blocks
  @after_crawl_blocks << block if block
  self
end
#filter_urls(&block) ⇒ Object
134 135 136 137 |
# File 'lib/anemone/core.rb', line 134
#
# Store +block+ in @before_filter_urls, replacing any previously stored
# block (calling without a block stores nil). Returns self for chaining.
def filter_urls(&block)
  tap { @before_filter_urls = block }
end
#focus_crawl(&block) ⇒ Object
Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.
156 157 158 159 |
# File 'lib/anemone/core.rb', line 156
#
# Specify a block which will select which links to follow on each page.
# The block should return an Array of URI objects.
# The block is kept in @focus_crawl_block (a later call replaces it);
# returns self for chaining.
def focus_crawl(&block)
  tap { @focus_crawl_block = block }
end
#include_links_like(*patterns) ⇒ Object
Add one or more Regex patterns for URLs which should be followed
120 121 122 123 |
# File 'lib/anemone/core.rb', line 120
#
# Append one or more patterns to @include_link_patterns. Arguments may be
# given individually or as (nested) Arrays; nil entries are dropped.
# Returns self for chaining.
# NOTE(review): presumably these mark URLs that should be followed, as the
# counterpart of #skip_links_like — confirm against the link-visiting logic.
def include_links_like(*patterns)
  usable = patterns.flatten.reject(&:nil?)
  @include_link_patterns.concat(usable)
  self
end
#on_every_page(&block) ⇒ Object
Add a block to be executed on every Page as they are encountered during the crawl
129 130 131 132 |
# File 'lib/anemone/core.rb', line 129
#
# Add a block to be executed on every Page as it is encountered during the
# crawl. Returns self so registrations can be chained.
def on_every_page(&block)
  # a call without a block used to append nil to the callback list; skip it
  # so the list only ever contains callable blocks
  @on_every_page_blocks << block if block
  self
end
#on_pages_like(*patterns, &block) ⇒ Object
Add a block to be executed on Page objects with a URL matching one or more patterns
143 144 145 146 147 148 149 150 |
# File 'lib/anemone/core.rb', line 143
#
# Add a block to be executed on Page objects with a URL matching one or
# more patterns.
#
# patterns - one or more patterns, given individually or as (nested)
#            Arrays; nil entries are ignored, matching how
#            #skip_links_like normalises its arguments.
# block    - the callback to register under every pattern; a call without
#            a block registers nothing.
#
# Returns self for chaining.
def on_pages_like(*patterns, &block)
  # the former `if patterns` guard was always true (a splat is never nil),
  # so the real no-op conditions are checked instead
  return self unless block

  patterns.flatten.compact.each do |pattern|
    @on_pages_like_blocks[pattern] << block
  end
  self
end
#run ⇒ Object
Perform the crawl
164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
# File 'lib/anemone/core.rb', line 164 def run @urls.delete_if { |url| !visit_link?(url) } return if @urls.empty? link_queue = Queue.new page_queue = Queue.new @opts[:threads].times do @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run } end @urls.each{ |url| link_queue.enq(url) } loop do page = page_queue.deq @pages.touch_key page.url puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose] do_page_blocks page page.discard_doc! if @opts[:discard_page_bodies] links = links_to_follow page links = do_filter_urls(links, page.depth) if @before_filter_urls links.each do |link| link_queue << [link, page.url.dup, page.depth + 1] end @pages.touch_keys links @pages[page.url] = page # if we are done with the crawl, tell the threads to end if link_queue.empty? and page_queue.empty? until link_queue.num_waiting == @tentacles.size Thread.pass end if page_queue.empty? @tentacles.size.times { link_queue << :END } break end end end @tentacles.each { |thread| thread.join } do_after_crawl_blocks self end |
#skip_links_like(*patterns) ⇒ Object
Add one or more Regex patterns for URLs which should not be followed
111 112 113 114 |
# File 'lib/anemone/core.rb', line 111
#
# Add one or more Regex patterns for URLs which should not be followed.
# Arguments may be given individually or as (nested) Arrays; nil entries
# are dropped. The patterns are appended to @skip_link_patterns.
# Returns self for chaining.
def skip_links_like(*patterns)
  usable = patterns.flatten.reject(&:nil?)
  @skip_link_patterns.concat(usable)
  self
end