Class: Anemone::Core
- Inherits:
-
Object
- Object
- Anemone::Core
- Defined in:
- lib/anemone/core.rb
Constant Summary collapse
- DEFAULT_OPTS =
{
  # Default crawl options, one entry per setting.

  # run 4 Tentacle threads to fetch pages
  :threads => 4,
  # disable verbose output
  :verbose => false,
  # don't throw away the page response body after scanning it for links
  :discard_page_bodies => false,
  # identify self as Anemone/VERSION
  :user_agent => "Anemone/#{Anemone::VERSION}",
  # no delay between requests
  :delay => 0,
  # don't obey the robots exclusion protocol
  :obey_robots_txt => false,
  # by default, don't limit the depth of the crawl
  :depth_limit => false,
  # number of times HTTP redirects will be followed
  :redirect_limit => 5,
  # storage engine defaults to Hash in +process_options+ if none specified
  :storage => nil,
  # Hash of cookie name => value to send with HTTP requests
  :cookies => nil,
  # accept cookies from the server and send them back?
  :accept_cookies => false,
  # Authentication setting; none by default.
  # NOTE(review): the expected value format is not visible here - confirm against the HTTP layer.
  :authorization => nil,
}
Instance Attribute Summary collapse
-
#opts ⇒ Object
readonly
Hash of options for the crawl.
-
#pages ⇒ Object
readonly
PageStore storing all Page objects encountered during the crawl.
Class Method Summary collapse
-
.crawl(urls, opts = {}) ⇒ Object
Convenience method to start a new crawl.
Instance Method Summary collapse
-
#after_crawl(&block) ⇒ Object
Add a block to be executed on the PageStore after the crawl is finished.
-
#focus_crawl(&block) ⇒ Object
Specify a block which will select which links to follow on each page.
-
#initialize(urls, opts = {}) {|_self| ... } ⇒ Core
constructor
Initialize the crawl with starting urls (single URL or Array of URLs) and optional block.
-
#on_every_page(&block) ⇒ Object
Add a block to be executed on every Page as they are encountered during the crawl.
-
#on_pages_like(*patterns, &block) ⇒ Object
Add a block to be executed on Page objects with a URL matching one or more patterns.
-
#run ⇒ Object
Perform the crawl.
-
#skip_links_like(*patterns) ⇒ Object
Add one or more Regex patterns for URLs which should not be followed.
Constructor Details
#initialize(urls, opts = {}) {|_self| ... } ⇒ Core
Initialize the crawl with starting urls (single URL or Array of URLs) and optional block
64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/anemone/core.rb', line 64
#
# Initialize the crawl with starting urls (single URL or Array of URLs)
# and an optional block.
#
# urls - a single URL or an (optionally nested) Array of URLs; each entry
#        is either a URI or a value that URI() can parse.
# opts - Hash of crawl options; stored as given in @opts.
#
# Yields the new instance when a block is given.
def initialize(urls, opts = {})
  # URI arguments are copied first: the path normalisation below assigns to
  # each URI, and that must not alter objects owned by the caller.
  @urls = [urls].flatten.map { |url| url.is_a?(URI) ? url.dup : URI(url) }
  # A URL without a path is crawled as the site root.
  @urls.each { |url| url.path = '/' if url.path.empty? }

  @tentacles = []
  @on_every_page_blocks = []
  # Each pattern lazily gets its own Array of blocks on first access.
  @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
  @opts = opts

  yield self if block_given?
end
Instance Attribute Details
#opts ⇒ Object (readonly)
Hash of options for the crawl
24 25 26 |
# File 'lib/anemone/core.rb', line 24 def opts @opts end |
#pages ⇒ Object (readonly)
PageStore storing all Page objects encountered during the crawl
22 23 24 |
# File 'lib/anemone/core.rb', line 22 def pages @pages end |
Class Method Details
.crawl(urls, opts = {}) ⇒ Object
Convenience method to start a new crawl
81 82 83 84 85 86 |
# File 'lib/anemone/core.rb', line 81
# One-call entry point: builds a crawler for +urls+ with +opts+, hands it to
# the caller's block (when one is given) for configuration, then runs it.
# Returns whatever +new+ returns.
def self.crawl(urls, opts = {})
  new(urls, opts) do |crawler|
    # Let the caller register callbacks before the crawl starts.
    yield crawler if block_given?
    crawler.run
  end
end
Instance Method Details
#after_crawl(&block) ⇒ Object
Add a block to be executed on the PageStore after the crawl is finished
92 93 94 95 |
# File 'lib/anemone/core.rb', line 92
# Registers +block+ in the list of after-crawl callbacks (documented as being
# run on the PageStore once the crawl has finished).
# Returns self so registrations can be chained.
def after_crawl(&block)
  @after_crawl_blocks.push(block)
  self
end
#focus_crawl(&block) ⇒ Object
Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.
132 133 134 135 |
# File 'lib/anemone/core.rb', line 132
# Stores +block+ as the link selector, replacing any block stored earlier.
# Per the method's documentation the block should return an Array of URI
# objects to follow for a given page.
# Returns self so calls can be chained.
def focus_crawl(&block)
  tap { @focus_crawl_block = block }
end
#on_every_page(&block) ⇒ Object
Add a block to be executed on every Page as they are encountered during the crawl
110 111 112 113 |
# File 'lib/anemone/core.rb', line 110
# Appends +block+ to the callbacks registered for every page (documented as
# being run on each Page as it is encountered during the crawl).
# Returns self so registrations can be chained.
def on_every_page(&block)
  tap { @on_every_page_blocks << block }
end
#on_pages_like(*patterns, &block) ⇒ Object
Add a block to be executed on Page objects with a URL matching one or more patterns
119 120 121 122 123 124 125 126 |
# File 'lib/anemone/core.rb', line 119
# Registers +block+ under each of the given URL patterns (documented as being
# run on Page objects whose URL matches one or more of them).
#
# patterns - any number of patterns; nested Arrays are flattened and nil
#            entries are dropped, mirroring how skip_links_like treats its
#            arguments.
#
# Returns self so registrations can be chained.
def on_pages_like(*patterns, &block)
  # A splat parameter is always an Array, so a truthiness check on it can
  # never filter anything; nil entries are removed explicitly instead.
  patterns.flatten.compact.each do |pattern|
    @on_pages_like_blocks[pattern] << block
  end
  self
end
#run ⇒ Object
Perform the crawl
140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
# File 'lib/anemone/core.rb', line 140 def run @urls.delete_if { |url| !visit_link?(url) } return if @urls.empty? link_queue = Queue.new page_queue = Queue.new @opts[:threads].times do @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run } end @urls.each{ |url| link_queue.enq(url) (url) if url.user } loop do page = page_queue.deq @pages.touch_key page.url puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose] do_page_blocks page page.discard_doc! if @opts[:discard_page_bodies] links = links_to_follow page links.each do |link| link_queue << [link, page.url.dup, page.depth + 1] end @pages.touch_keys links @pages[page.url] = page # if we are done with the crawl, tell the threads to end if link_queue.empty? and page_queue.empty? until link_queue.num_waiting == @tentacles.size Thread.pass end if page_queue.empty? @tentacles.size.times { link_queue << :END } break end end end @tentacles.each { |thread| thread.join } do_after_crawl_blocks self end |
#skip_links_like(*patterns) ⇒ Object
Add one or more Regex patterns for URLs which should not be followed
101 102 103 104 |
# File 'lib/anemone/core.rb', line 101
# Adds the given patterns to the list of URL patterns that should not be
# followed. Nested Arrays are flattened and nil entries are ignored.
# Returns self so calls can be chained.
def skip_links_like(*patterns)
  usable = patterns.flatten.compact
  @skip_link_patterns.concat(usable)
  self
end