Class: Medusa::Core
- Inherits: Object
- Defined in: lib/medusa/core.rb
Constant Summary
- DEFAULT_OPTS =
{
  # run 4 Tentacle threads to fetch pages
  :threads => 4,
  # don't throw away the page response body after scanning it for links
  :discard_page_bodies => false,
  # identify self as Medusa/VERSION
  :user_agent => "Medusa/#{Medusa::VERSION}",
  # no delay between requests
  :delay => 0,
  # don't obey the robots exclusion protocol
  :obey_robots_txt => false,
  # by default, don't limit the depth of the crawl
  :depth_limit => false,
  # number of times HTTP redirects will be followed
  :redirect_limit => 5,
  # storage engine defaults to an in-memory store in +process_options+ if none is specified
  :storage => nil,
  # clear the storage on every startup of the crawler
  :clear_on_startup => true,
  # Hash of cookie name => value to send with HTTP requests
  :cookies => nil,
  # accept cookies from the server and send them back?
  :accept_cookies => false,
  # skip any link with a query string? e.g. http://foo.com/?u=user
  :skip_query_strings => false,
  # proxy server hostname
  :proxy_host => nil,
  # proxy server port number
  :proxy_port => false,
  # HTTP read timeout in seconds
  :read_timeout => nil
}.freeze
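Any of these defaults can be overridden by passing an options Hash when starting a crawl. A minimal sketch (the URL and chosen values are hypothetical):

Medusa::Core.crawl("http://www.example.com",
                   :threads => 2,
                   :obey_robots_txt => true,
                   :depth_limit => 3)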
Instance Attribute Summary
- #opts ⇒ Object (readonly)
  Hash of options for the crawl.
- #pages ⇒ Object (readonly)
  PageStore storing all Page objects encountered during the crawl.
Class Method Summary
- .crawl(urls, opts = {}) ⇒ Object
  Convenience method to start a new crawl.
Instance Method Summary
- #after_crawl(&block) ⇒ Object
  Add a block to be executed on the PageStore after the crawl is finished.
- #focus_crawl(&block) ⇒ Object
  Specify a block which will select which links to follow on each page.
- #initialize(urls, opts = {}) {|_self| ... } ⇒ Core (constructor)
  Initialize the crawl with starting URLs (single URL or Array of URLs) and an optional block.
- #on_every_page(&block) ⇒ Object
  Add a block to be executed on every Page as it is encountered during the crawl.
- #on_pages_like(*patterns, &block) ⇒ Object
  Add a block to be executed on Page objects with a URL matching one or more patterns.
- #run ⇒ Object
  Perform the crawl.
- #skip_links_like(*patterns) ⇒ Object
  Add one or more Regexp patterns for URLs which should not be followed.
Constructor Details
#initialize(urls, opts = {}) {|_self| ... } ⇒ Core
Initialize the crawl with starting URLs (single URL or Array of URLs) and an optional block.
# File 'lib/medusa/core.rb', line 70

def initialize(urls, opts = {})
  @urls = [urls].flatten.map { |url| url.is_a?(URI) ? url : URI(url) }
  @urls.each { |url| url.path = '/' if url.path.empty? }

  @tentacles = []
  @on_every_page_blocks = []
  @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
  @opts = opts
  @focus_crawl_block = nil

  yield self if block_given?
end
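A minimal sketch of the constructor's block form, configuring the crawl before running it yourself (the URL and pattern are hypothetical):

core = Medusa::Core.new("http://www.example.com") do |medusa|
  medusa.skip_links_like(/\.pdf$/i)
end
core.run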
Instance Attribute Details
#opts ⇒ Object (readonly)
Hash of options for the crawl
# File 'lib/medusa/core.rb', line 24

def opts
  @opts
end
#pages ⇒ Object (readonly)
PageStore storing all Page objects encountered during the crawl
# File 'lib/medusa/core.rb', line 22

def pages
  @pages
end
Class Method Details
.crawl(urls, opts = {}) ⇒ Object
Convenience method to start a new crawl
# File 'lib/medusa/core.rb', line 89

def self.crawl(urls, opts = {})
  self.new(urls, opts) do |core|
    yield core if block_given?
    core.run
  end
end
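A minimal sketch of a crawl started through the convenience method, which builds the Core, yields it for configuration, and calls #run for you (the URL is hypothetical):

Medusa::Core.crawl("http://www.example.com") do |medusa|
  medusa.on_every_page { |page| puts page.url }
end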
Instance Method Details
#after_crawl(&block) ⇒ Object
Add a block to be executed on the PageStore after the crawl is finished
# File 'lib/medusa/core.rb', line 100

def after_crawl(&block)
  @after_crawl_blocks << block
  self
end
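Since the method returns self, registrations can be chained. A minimal sketch; the block receives the PageStore once the crawl completes, and PageStore#size is an assumption made here for illustration:

Medusa::Core.crawl("http://www.example.com") do |medusa|
  medusa.after_crawl do |pages|
    # `pages` is the PageStore; #size is assumed for this example
    puts "Crawl finished with #{pages.size} pages"
  end
end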
#focus_crawl(&block) ⇒ Object
Specify a block which will select which links to follow on each page. The block should return an Array of URI objects.
# File 'lib/medusa/core.rb', line 140

def focus_crawl(&block)
  @focus_crawl_block = block
  self
end
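A minimal sketch that keeps the crawl on the start host. The block is given each Page and must return an Array of URI objects; Page#links (the links extracted from the page) is assumed here:

Medusa::Core.crawl("http://www.example.com") do |medusa|
  medusa.focus_crawl do |page|
    # follow only links pointing at the same host
    page.links.select { |uri| uri.host == page.url.host }
  end
end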
#on_every_page(&block) ⇒ Object
Add a block to be executed on every Page as it is encountered during the crawl
# File 'lib/medusa/core.rb', line 118

def on_every_page(&block)
  @on_every_page_blocks << block
  self
end
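A minimal sketch that logs each fetched page with its depth (Page#depth is the same attribute #run uses when enqueuing links):

Medusa::Core.crawl("http://www.example.com") do |medusa|
  medusa.on_every_page do |page|
    puts "#{page.url} (depth #{page.depth})"
  end
end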
#on_pages_like(*patterns, &block) ⇒ Object
Add a block to be executed on Page objects with a URL matching one or more patterns
# File 'lib/medusa/core.rb', line 127

def on_pages_like(*patterns, &block)
  if patterns
    patterns.each do |pattern|
      @on_pages_like_blocks[pattern] << block
    end
  end
  self
end
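A minimal sketch that reacts only to article-style URLs; the pattern is hypothetical:

Medusa::Core.crawl("http://www.example.com") do |medusa|
  medusa.on_pages_like(%r{/articles/\d+}) do |page|
    puts "Article: #{page.url}"
  end
end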
#run ⇒ Object
Perform the crawl
# File 'lib/medusa/core.rb', line 148

def run
  # drop any start URLs that should not be visited
  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?

  link_queue = Queue.new
  page_queue = Queue.new

  # spawn one Tentacle worker thread per configured :threads
  @opts[:threads].times do
    @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts.dup).run }
  end

  # seed the link queue with the start URLs
  @urls.each { |url| link_queue.enq(url) }

  loop do
    page = page_queue.deq
    @pages.touch_key page.url
    do_page_blocks page
    page.discard_doc! if @opts[:discard_page_bodies]

    links = links_to_follow page
    # enqueue each link together with its referring URL and depth
    links.each do |link|
      link_queue << [link, page.url.dup, page.depth + 1]
    end
    @pages.touch_keys links

    @pages[page.url] = page

    # if we are done with the crawl, tell the threads to end
    if link_queue.empty? and page_queue.empty?
      until link_queue.num_waiting == @tentacles.size
        Thread.pass
      end
      if page_queue.empty?
        # one :END sentinel per tentacle shuts the workers down
        @tentacles.size.times { link_queue << :END }
        break
      end
    end
  end

  @tentacles.each { |thread| thread.join }
  do_after_crawl_blocks
  self
end
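A minimal sketch of driving #run directly instead of through .crawl, then looking up the start page afterwards (PageStore#[] reads are assumed here; note that #initialize normalizes an empty path to '/'):

core = Medusa::Core.new("http://www.example.com")
core.run
start_page = core.pages[URI("http://www.example.com/")]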
#skip_links_like(*patterns) ⇒ Object
Add one or more Regexp patterns for URLs which should not be followed
# File 'lib/medusa/core.rb', line 109

def skip_links_like(*patterns)
  @skip_link_patterns.concat [patterns].flatten.compact
  self
end
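A minimal sketch excluding binary downloads and an admin area; the patterns are hypothetical and are matched against candidate link URLs:

Medusa::Core.crawl("http://www.example.com") do |medusa|
  medusa.skip_links_like(/\.(zip|pdf)$/i, %r{/admin})
end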