Class: Anemone::Core
- Inherits:
-
Object
- Object
- Anemone::Core
- Defined in:
- lib/anemone/core.rb
Constant Summary collapse
# Default crawl options; user-supplied opts are merged over these.
# NOTE: the extracted source had this hash collapsed onto one line,
# which is invalid Ruby (the first inline comment swallowed the rest
# of the hash) — restored to its multi-line form.
DEFAULT_OPTS = {
  # run 4 Tentacle threads to fetch pages
  :threads => 4,
  # disable verbose output
  :verbose => false,
  # don't throw away the page response body after scanning it for links
  :discard_page_bodies => false,
  # identify self as Anemone/VERSION
  :user_agent => "Anemone/#{Anemone::VERSION}",
  # no delay between requests
  :delay => 0,
  # don't obey the robots exclusion protocol
  :obey_robots_txt => false,
  # by default, don't limit the depth of the crawl
  :depth_limit => false,
  # number of times HTTP redirects will be followed
  :redirect_limit => 5
}
Instance Attribute Summary collapse
-
#opts ⇒ Object
Hash of options for the crawl.
-
#pages ⇒ Object
readonly
PageHash storing all Page objects encountered during the crawl.
Class Method Summary collapse
-
.crawl(urls, opts = {}) ⇒ Object
Convenience method to start a new crawl.
Instance Method Summary collapse
-
#after_crawl(&block) ⇒ Object
Add a block to be executed on the PageHash after the crawl is finished.
-
#focus_crawl(&block) ⇒ Object
Specify a block which will select which links to follow on each page.
-
#initialize(urls, opts = {}) {|_self| ... } ⇒ Core
constructor
Initialize the crawl with starting urls (single URL or Array of URLs) and optional block.
-
#on_every_page(&block) ⇒ Object
Add a block to be executed on every Page as they are encountered during the crawl.
-
#on_pages_like(*patterns, &block) ⇒ Object
Add a block to be executed on Page objects with a URL matching one or more patterns.
-
#run ⇒ Object
Perform the crawl.
-
#skip_links_like(*patterns) ⇒ Object
Add one or more Regexp patterns for URLs which should not be followed.
Constructor Details
#initialize(urls, opts = {}) {|_self| ... } ⇒ Core
Initialize the crawl with starting urls (single URL or Array of URLs) and optional block
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# Initialize the crawl with starting urls (single URL or Array of URLs)
# and optional block.
#
# urls - a URI/String or an Array of them; each is coerced to URI and
#        given a root path ('/') when the path is empty.
# opts - Hash of crawl options, merged over DEFAULT_OPTS.
#
# Yields self when a block is given, so callers can configure the crawl.
def initialize(urls, opts = {})
  @urls = [urls].flatten.map { |url| url.is_a?(URI) ? url : URI(url) }
  @urls.each { |url| url.path = '/' if url.path.empty? }

  @tentacles = []
  @pages = PageHash.new
  @on_every_page_blocks = []
  @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []

  # BUG FIX: the original body contained the bare expression `opts`,
  # a no-op that left @opts unset even though #run reads it
  # (@opts[:threads], @opts[:verbose], ...). Merge over the defaults.
  @opts = DEFAULT_OPTS.merge(opts)

  yield self if block_given?
end
Instance Attribute Details
#opts ⇒ Object
Hash of options for the crawl
23 24 25 |
# File 'lib/anemone/core.rb', line 23 def opts @opts end |
#pages ⇒ Object (readonly)
PageHash storing all Page objects encountered during the crawl
20 21 22 |
# File 'lib/anemone/core.rb', line 20 def pages @pages end |
Class Method Details
.crawl(urls, opts = {}) ⇒ Object
Convenience method to start a new crawl
67 68 69 70 71 72 |
# Convenience method to start a new crawl: builds a Core, hands it to
# the caller's block for configuration, then runs it. Returns the Core.
def self.crawl(urls, opts = {})
  instance = self.new(urls, opts) do |c|
    yield c if block_given?
    c.run
  end
  instance
end
Instance Method Details
#after_crawl(&block) ⇒ Object
Add a block to be executed on the PageHash after the crawl is finished
78 79 80 81 |
# Add a block to be executed on the PageHash after the crawl is finished.
# Returns self so calls can be chained.
def after_crawl(&block)
  @after_crawl_blocks.push(block)
  self
end
#focus_crawl(&block) ⇒ Object
Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.
118 119 120 121 |
# Specify a block which will select which links to follow on each page.
# The block should return an Array of URI objects. Returns self for chaining.
def focus_crawl(&selector)
  @focus_crawl_block = selector
  self
end
#on_every_page(&block) ⇒ Object
Add a block to be executed on every Page as they are encountered during the crawl
96 97 98 99 |
# Add a block to be executed on every Page as they are encountered
# during the crawl. Returns self for chaining.
def on_every_page(&handler)
  @on_every_page_blocks.push(handler)
  self
end
#on_pages_like(*patterns, &block) ⇒ Object
Add a block to be executed on Page objects with a URL matching one or more patterns
105 106 107 108 109 110 111 112 |
# Add a block to be executed on Page objects with a URL matching
# one or more patterns. Returns self for chaining.
def on_pages_like(*patterns, &block)
  # *patterns is always an Array (possibly empty), never nil, so the
  # original `if patterns` guard was vacuous and has been removed.
  patterns.each do |pattern|
    @on_pages_like_blocks[pattern] << block
  end
  self
end
#run ⇒ Object
Perform the crawl
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
# File 'lib/anemone/core.rb', line 126 def run @urls.delete_if { |url| !visit_link?(url) } return if @urls.empty? link_queue = Queue.new page_queue = Queue.new @opts[:threads].times do @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run } end @urls.each{ |url| link_queue.enq(url) } loop do page = page_queue.deq @pages[page.url] = page puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose] # perform the on_every_page blocks for this page do_page_blocks(page) page.discard_doc! if @opts[:discard_page_bodies] links_to_follow(page).each do |link| link_queue.enq([link, page]) @pages[link] = nil end # create an entry in the page hash for each alias of this page, # i.e. all the pages that redirected to this page page.aliases.each do |aka| if !@pages.has_key?(aka) or @pages[aka].nil? @pages[aka] = page.alias_clone(aka) end @pages[aka].add_alias!(page.url) end # if we are done with the crawl, tell the threads to end if link_queue.empty? and page_queue.empty? until link_queue.num_waiting == @tentacles.size Thread.pass end if page_queue.empty? @tentacles.size.times { link_queue.enq(:END)} break end end end @tentacles.each { |t| t.join } do_after_crawl_blocks() self end |
#skip_links_like(*patterns) ⇒ Object
Add one or more Regexp patterns for URLs which should not be followed
87 88 89 90 |
# Add one or more Regexp patterns for URLs which should not be followed.
# Accepts individual patterns or (nested) arrays; nils are ignored.
# Returns self for chaining.
def skip_links_like(*patterns)
  patterns.flatten.compact.each { |pattern| @skip_link_patterns << pattern }
  self
end