Module: Scrapey
- Defined in:
- lib/scrapey/tee.rb,
lib/scrapey/tor.rb,
lib/scrapey/cache.rb,
lib/scrapey/multi.rb,
lib/scrapey/scrapey.rb,
lib/scrapey/database.rb,
lib/scrapey/template.rb,
lib/scrapey/constants.rb,
lib/scrapey/cache/disk.rb,
lib/scrapey/cache/redis.rb
Defined Under Namespace
Modules: Template Classes: Tee
Constant Summary collapse
- VERSION =
"0.0.16"
- BASEDIR =
File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
- URL =
"https://github.com/monkeysuffrage/scrapey"
Class Method Summary collapse
- .init(b) ⇒ Object
Instance Method Summary collapse
- #cache_filename(url) ⇒ Object
- #change_identity ⇒ Object
- #check_db_config ⇒ Object
- #debug(msg) ⇒ Object
- #delete_cache(url) ⇒ Object
- #disable_cache ⇒ Object
- #enqueue(url) ⇒ Object
- #fields(*args) ⇒ Object
- #get(*args) ⇒ Object
- #get_or_post(method, url, options = {}, *args) ⇒ Object
- #goto(*args) ⇒ Object
- #head(*args) ⇒ Object
- #init_db ⇒ Object
- #is_cached?(url) ⇒ Boolean
- #load_cache(url) ⇒ Object
- #multi_get(*args) ⇒ Object
- #multi_get_or_post(method, all_urls, options = {}) ⇒ Object
- #multi_head(*args) ⇒ Object
- #multi_post(*args) ⇒ Object
- #post(*args) ⇒ Object
- #save(item) ⇒ Object
- #save_cache(url, body, options = {}) ⇒ Object
- #set_proxy(*args) ⇒ Object
- #tables(*args) ⇒ Object
- #truncate(*args) ⇒ Object
- #ts ⇒ Object
- #use_cache(options = {}) ⇒ Object
- #use_tor ⇒ Object
- #visit(*args) ⇒ Object
- #visited?(url) ⇒ Boolean
- #with_cache(cassette_name = 'my_cassette') ⇒ Object
- #without_cache ⇒ Object
Class Method Details
.init(b) ⇒ Object
3 4 5 6 7 8 9 10 |
# File 'lib/scrapey/scrapey.rb', line 3
# Turns the object behind a binding into a scraper: every snippet below is
# evaluated in the context of +b+, in order.
#
# @param b [Binding] binding in which the setup code is evaluated
# @return [Object] value of the last evaluated snippet
def self.init(b)
  setup = [
    "include Scrapey",
    # some defaults that I like
    "@agent ||= Mechanize.new{|a| a.history.max_size = 10}",
    "@agent.user_agent = 'Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}'",
    # NOTE(review): this turns TLS certificate verification off for the agent
    "@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE"
  ]
  setup.map { |snippet| eval(snippet, b) }.last
end
Instance Method Details
#cache_filename(url) ⇒ Object
5 6 7 |
# File 'lib/scrapey/cache/disk.rb', line 5
# Builds the on-disk cache path for a URL: the configured cache directory,
# a slash, the MD5 hex digest of the URL and a ".cache" suffix.
#
# @param url [String] URL whose cache file name is wanted
# @return [String] path below @config['cache_dir']
def cache_filename(url)
  directory = @config['cache_dir']
  digest = Digest::MD5.hexdigest(url)
  directory + "/" + digest + ".cache"
end
#change_identity ⇒ Object
8 9 10 11 12 13 14 |
# File 'lib/scrapey/tor.rb', line 8
# Asks the Tor control port on localhost:9051 for a new route by sending
# AUTHENTICATE followed by "signal NEWNYM".
#
# The connection is closed in an +ensure+ so it is released even when one of
# the commands is rejected.
#
# @raise [RuntimeError] when a command is not answered with "250 OK\n"
def change_identity
  debug "changing identity..."
  localhost = Net::Telnet::new("Host" => "localhost", "Port" => "9051", "Timeout" => 10, "Prompt" => /250 OK\n/)
  begin
    localhost.cmd('AUTHENTICATE ""') {|c| raise "Cannot authenticate to Tor" unless "250 OK\n" == c}
    localhost.cmd('signal NEWNYM') {|c| raise "Cannot switch Tor to new route" unless "250 OK\n" == c}
  ensure
    localhost.close
  end
end
#check_db_config ⇒ Object
2 3 4 |
# File 'lib/scrapey/database.rb', line 2
# Guard used by the database helpers: fails unless @config has a truthy
# 'database' entry.
#
# @return [nil] when a database entry is present
# @raise [RuntimeError] 'No database configured' otherwise
def check_db_config
  return if @config['database']

  raise 'No database configured'
end
#debug(msg) ⇒ Object
81 82 83 |
# File 'lib/scrapey/scrapey.rb', line 81
# Prints +msg+ with puts, but only while @debug is truthy.
#
# @param msg [Object] message to print
# @return [nil]
def debug(msg)
  return unless @debug

  puts msg
end
#delete_cache(url) ⇒ Object
28 29 30 |
# File 'lib/scrapey/cache/disk.rb', line 28
# Removes the cache file that belongs to +url+ (path from #cache_filename).
#
# @param url [String] URL whose cache entry is removed
def delete_cache(url)
  path = cache_filename(url)
  FileUtils.rm(path)
end
#disable_cache ⇒ Object
14 15 16 17 18 |
# File 'lib/scrapey/cache.rb', line 14
# Runs the given block with caching switched off.
#
# The previous value of @use_cache is put back afterwards, also when the
# block raises, so a failing block cannot leave the cache disabled and a
# scraper that never enabled caching does not get it switched on.
#
# @yield the work to run without cache
# @return [Object] the block's value
def disable_cache
  previous = @use_cache
  @use_cache = false
  yield
ensure
  @use_cache = previous
end
#enqueue(url) ⇒ Object
89 90 91 92 93 |
# File 'lib/scrapey/scrapey.rb', line 89
# Appends +url+ and a newline to config/urls.txt under BASEDIR.
#
# The file is opened in 'w' mode on first use and the handle is kept in
# @url_list for later calls; this method never closes it.
#
# @param url [String] URL to write
# @return [File] the open url list
def enqueue(url)
  @url_list = File.open("#{BASEDIR}/config/urls.txt", 'w') unless @url_list
  @url_list << url << "\n"
end
#fields(*args) ⇒ Object
56 57 58 |
# File 'lib/scrapey/scrapey.rb', line 56
# Stores the given arguments in @fields.
#
# @param args [Array] values to remember
# @return [Array] the stored list
def fields(*args)
  @fields = args
end
#get(*args) ⇒ Object
46 |
# File 'lib/scrapey/scrapey.rb', line 46
# Delegates to #get_or_post with 'get' as the method name.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever #get_or_post returns
def get(*args)
  get_or_post('get', *args)
end
#get_or_post(method, url, options = {}, *args) ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/scrapey/scrapey.rb', line 13
# Sends +method+ to the agent (@browser for 'goto'/'visit', @agent otherwise)
# with the url and any extra arguments, going through the cache when
# @use_cache is set.
#
# +options+ may carry :retries and :sleep; both are removed before the hash
# is forwarded to the agent. The caller's hash is not modified.
#
# On failure: if an +on_error+ method is defined it is called and its value
# returned; otherwise the request is retried while retries remain (sleeping
# :sleep seconds first when given); otherwise the error is re-raised.
#
# @param method [String] agent method name, e.g. 'get' or 'post'
# @param url [String] target URL
# @param options [Hash] arguments for the agent plus :retries / :sleep
# @param args [Array] further arguments for the agent
# @return [Object] the cached document or the page returned by the agent
def get_or_post method, url, options={}, *args
  # work on a copy so the caller's hash keeps its :retries / :sleep entries
  options = options.dup
  agent = ['goto', 'visit'].include?(method) ? @browser : @agent
  _retries = options.delete :retries
  _sleep = options.delete :sleep
  begin
    new_args = [method, url]
    unless options.empty? && args.empty?
      new_args << options
      new_args.concat args
    end

    doc = load_cache(url) if @use_cache
    return doc if doc

    page = agent.send(*new_args)
    if @use_cache
      # only serialise the page when it is actually going to be stored
      str = page.respond_to?('root') ? page.root.to_s : page.body
      save_cache(url, str)
    end

    #exit if Object.const_defined? :Ocra
    page
  rescue StandardError => e
    # StandardError only: Interrupt / SystemExit must not trigger a retry
    if defined? on_error
      return on_error e, method, url, options, *args
    elsif _retries && _retries > 0
      # :retries was deleted from options above, so report the local counter
      puts "Error. Retries remaining: #{_retries}"
      sleep _sleep if _sleep
      get_or_post method, url, options.merge({:retries => _retries - 1, :sleep => _sleep}), *args
    else
      raise e
    end
  end
end
#goto(*args) ⇒ Object
49 |
# File 'lib/scrapey/scrapey.rb', line 49
# Delegates to #get_or_post with 'goto' as the method name.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever #get_or_post returns
def goto(*args)
  get_or_post('goto', *args)
end
#head(*args) ⇒ Object
48 |
# File 'lib/scrapey/scrapey.rb', line 48
# Delegates to #get_or_post with 'head' as the method name.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever #get_or_post returns
def head(*args)
  get_or_post('head', *args)
end
#init_db ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/scrapey/database.rb', line 24
# Requires ActiveRecord, the library named by the configured adapter and the
# supporting libraries (in this order), then opens the connection described
# by @config['database'].
#
# @return [Object] whatever ActiveRecord::Base.establish_connection returns
def init_db
  db_config = @config['database']
  libraries = %w[
    active_record
    active_record/schema
    active_record/connection_adapters/abstract/schema_definitions
  ]
  libraries << db_config['adapter']
  libraries.push('tzinfo', 'active_support/all', 'active_support/multibyte/chars')
  libraries.each { |lib| require lib }

  ActiveRecord::Base.establish_connection(@config['database'])
end
#is_cached?(url) ⇒ Boolean
9 10 11 |
# File 'lib/scrapey/cache/disk.rb', line 9
# Tells whether a cache file exists for +url+.
#
# Uses File.exist?; the File.exists? alias no longer exists in Ruby 3.2+.
#
# @param url [String] URL to look up
# @return [Boolean] true when the file named by #cache_filename exists
def is_cached? url
  File.exist? cache_filename(url)
end
#load_cache(url) ⇒ Object
13 14 15 16 17 18 19 20 21 22 |
# File 'lib/scrapey/cache/disk.rb', line 13
# Loads the cached body for +url+ from disk and wraps it in a Mechanize::Page.
#
# Best effort: when the file cannot be read or turned into a page, the error
# message is printed and nil is returned, so the caller fetches the page again.
#
# @param url [String] URL whose cached copy is wanted
# @return [Mechanize::Page, nil] the cached page, or nil when there is no
#   cache file or it could not be loaded
def load_cache url
  filename = cache_filename url
  return nil unless File.exist?(filename)
  debug "Loading #{filename} from cache"
  begin
    # NOTE(review): Marshal.load must only see trusted data; this is safe only
    # while nobody else can write to the cache directory.
    body = Marshal.load(File.binread(filename))
    Mechanize::Page.new URI.parse(url), [], body, nil, @agent
  rescue StandardError => e
    puts e.message
    nil
  end
end
#multi_get(*args) ⇒ Object
50 |
# File 'lib/scrapey/multi.rb', line 50
# Delegates to #multi_get_or_post with 'get_content' as the client method.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever #multi_get_or_post returns
def multi_get(*args)
  multi_get_or_post('get_content', *args)
end
#multi_get_or_post(method, all_urls, options = {}) ⇒ Object
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/scrapey/multi.rb', line 7
# Requests many URLs concurrently: URLs are processed in slices of :threads,
# one thread and one HTTPClient per URL of a slice, and every slice is joined
# before the next one starts.
#
# Each result is handed to a callback on self while holding @lock, so the
# callbacks never run concurrently: +on_success(url, response)+ when the
# client returned something truthy, +on_error(url, error)+ otherwise.
#
# Clients are memoised in @http_clients and reused by later calls; proxy,
# user agent, timeout and the optional block only apply to clients created
# by the current call.
#
# @param method [String] HTTPClient method name, e.g. 'get_content'
# @param all_urls [Array<String>] URLs to request
# @param options [Hash] :threads (20), :on_success, :on_error, :user_agent,
#   :proxy, :timeout (1000), :follow_redirect (true), :query, :headers
# @yield [client] each newly created HTTPClient, for extra configuration
def multi_get_or_post method, all_urls, options = {}
  # some sensible defaults
  threads = options[:threads] || 20
  on_success = options[:on_success] || :on_success
  on_error = options[:on_error] || :on_error
  user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
  proxy = options[:proxy] || nil
  timeout = options[:timeout] || 1000
  # fetch, not `|| true`, so an explicit false is honoured
  follow_redirect = options.fetch(:follow_redirect, true)

  @lock ||= Mutex.new
  @http_clients ||= []
  # grow the pool when this call needs more clients than an earlier one created
  while @http_clients.size < threads
    c = HTTPClient.new proxy, user_agent
    # NOTE(review): certificate verification is switched off for these clients
    c.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
    c.receive_timeout = timeout
    yield c if block_given?
    @http_clients << c
  end

  debug 'starting multi'

  all_urls.each_slice(threads) do |urls|
    urls.each_with_index.map do |url, i|
      Thread.new do
        response = error = nil
        begin
          response = @http_clients[i].send method, url, options[:query], options[:headers], :follow_redirect => follow_redirect
        rescue StandardError => e
          error = e
        end
        @lock.synchronize do
          if response
            send on_success, url, response
          else
            send on_error, url, error
          end
        end
      end
    end.each{|thread| thread.join}
  end
end
#multi_head(*args) ⇒ Object
52 |
# File 'lib/scrapey/multi.rb', line 52
# Delegates to #multi_get_or_post with 'head' as the client method.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever #multi_get_or_post returns
def multi_head(*args)
  multi_get_or_post('head', *args)
end
#multi_post(*args) ⇒ Object
51 |
# File 'lib/scrapey/multi.rb', line 51
# Delegates to #multi_get_or_post with 'post_content' as the client method.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever #multi_get_or_post returns
def multi_post(*args)
  multi_get_or_post('post_content', *args)
end
#post(*args) ⇒ Object
47 |
# File 'lib/scrapey/scrapey.rb', line 47
# Delegates to #get_or_post with 'post' as the method name.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever #get_or_post returns
def post(*args)
  get_or_post('post', *args)
end
#save(item) ⇒ Object
60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/scrapey/scrapey.rb', line 60
# Appends one record to the CSV file named by @output.
#
# The file is (re)opened in 'w' mode whenever there is no open @csv handle,
# and @fields, when set, is written first as the header row. Arrays are
# written as they are; Hash and CSV::Row items are written in @fields order.
#
# @param item [Array, Hash, CSV::Row] record to write
# @return [CSV] the open CSV handle
# @raise [RuntimeError] 'No fields defined!' for a Hash/CSV::Row without
#   @fields, or "unsupported type: ..." for any other class
def save(item)
  if !@csv || @csv.closed?
    @csv = CSV.open(@output, 'w')
    @csv << @fields if @fields
  end

  row =
    case item
    when Array
      item
    when Hash, CSV::Row
      raise 'No fields defined!' unless @fields
      @fields.map { |f| item[f] }
    else
      raise "unsupported type: #{item.class}"
    end
  @csv << row
end
#save_cache(url, body, options = {}) ⇒ Object
24 25 26 |
# File 'lib/scrapey/cache/disk.rb', line 24
# Writes +doc+, serialised with Marshal, to the cache file for +url+.
#
# @param url [String] URL the document belongs to
# @param doc [Object] document body to store
# @param options [Hash] accepted but not used by the disk cache
# @return [File] the (closed) cache file
def save_cache url, doc, options = {}
  File.open(cache_filename(url), "wb") {|f| f << Marshal.dump(doc) }
end
#set_proxy(*args) ⇒ Object
52 53 54 |
# File 'lib/scrapey/scrapey.rb', line 52
# Forwards the proxy settings to @agent.set_proxy.
#
# @param args [Array] arguments handed to the agent unchanged
# @return [Object] whatever the agent returns
def set_proxy(*args)
  @agent.set_proxy(*args)
end
#tables(*args) ⇒ Object
6 7 8 9 10 11 12 13 14 15 |
# File 'lib/scrapey/database.rb', line 6
# Defines one top-level ActiveRecord::Base subclass per given name and, when
# any of their tables is missing, loads src/schema.rb under BASEDIR if that
# file exists.
#
# Uses File.exist?; the File.exists? alias no longer exists in Ruby 3.2+.
#
# @param args [Array<String, Symbol>] constant names of the models to define
# @raise [RuntimeError] via #check_db_config when no database is configured
def tables *args
  check_db_config
  missing_tables = false
  args.each do |arg|
    model = Object.const_set(arg, Class.new(ActiveRecord::Base) {})
    missing_tables = true unless model.table_exists?
  end
  schema = "#{BASEDIR}/src/schema.rb"
  require schema if missing_tables && File.exist?(schema)
end
#truncate(*args) ⇒ Object
17 18 19 20 21 22 |
# File 'lib/scrapey/database.rb', line 17
# Empties the table behind each named model by running TRUNCATE TABLE on the
# ActiveRecord connection.
#
# @param args [Array<String, Symbol>] constant names of the models to empty
# @raise [RuntimeError] via #check_db_config when no database is configured
def truncate(*args)
  check_db_config
  args.each do |model_name|
    table = Object.const_get(model_name).table_name
    ActiveRecord::Base.connection.execute("TRUNCATE TABLE #{table}")
  end
end
#ts ⇒ Object
85 86 87 |
# File 'lib/scrapey/scrapey.rb', line 85
# Current time as whole seconds since the Unix epoch, as a string.
#
# @return [String] e.g. "1700000000"
def ts
  epoch_seconds = Time.now.to_i
  String(epoch_seconds)
end
#use_cache(options = {}) ⇒ Object
3 4 5 6 7 8 9 10 11 12 |
# File 'lib/scrapey/cache.rb', line 3
# Switches caching on and loads the cache backend.
#
# With a truthy :redis option the value is kept in @redis and the redis
# backend is required. Otherwise the disk backend is required and the cache
# directory (default: "cache" under BASEDIR) is created.
#
# @param options [Hash] :redis selects the redis backend; the hash is only
#   read, never modified
def use_cache options = {}
  @use_cache = true
  @redis = options[:redis]
  if @redis
    require 'scrapey/cache/redis'
  else
    require 'scrapey/cache/disk'
    @config['cache_dir'] ||= "#{BASEDIR}/cache"
    FileUtils.mkdir_p @config['cache_dir']
  end
end
#use_tor ⇒ Object
4 5 6 |
# File 'lib/scrapey/tor.rb', line 4
# Points the agent at the proxy on localhost:8118 via #set_proxy.
#
# @return [Object] whatever #set_proxy returns
def use_tor
  proxy_host = 'localhost'
  proxy_port = 8118
  set_proxy(proxy_host, proxy_port)
end
#visit(*args) ⇒ Object
50 |
# File 'lib/scrapey/scrapey.rb', line 50
# Delegates to #get_or_post with 'visit' as the method name.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever #get_or_post returns
def visit(*args)
  get_or_post('visit', *args)
end
#visited?(url) ⇒ Boolean
74 75 76 77 78 79 |
# File 'lib/scrapey/scrapey.rb', line 74
# Reports whether +url+ was already passed to this method, and records it
# in @visited when it was not.
#
# @param url [String] URL to check
# @return [Boolean] false the first time a URL is seen, true afterwards
def visited?(url)
  @visited ||= []
  already_seen = @visited.include?(url)
  @visited << url unless already_seen
  already_seen
end
#with_cache(cassette_name = 'my_cassette') ⇒ Object
25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
# File 'lib/scrapey/cache.rb', line 25
# Runs the given block inside a VCR cassette.
#
# VCR is configured on every call: cassettes live in "cache" under BASEDIR,
# requests are hooked through fakeweb and HTTP stays allowed when no cassette
# is active. The cassette records new episodes and matches requests on
# method, URI and body.
#
# @param cassette_name [String] name of the cassette to use
# @yield the work whose HTTP traffic should be recorded or replayed
# @return [Object] whatever VCR.use_cassette returns
def with_cache(cassette_name = 'my_cassette')
  require 'vcr'
  require 'fakeweb'

  VCR.configure do |config|
    config.cassette_library_dir = "#{BASEDIR}/cache"
    config.hook_into :fakeweb
    config.allow_http_connections_when_no_cassette = true
  end

  cassette_options = { :record => :new_episodes, :match_requests_on => [:method, :uri, :body] }
  VCR.use_cassette(cassette_name, **cassette_options) { yield }
end
#without_cache ⇒ Object
21 22 23 |
# File 'lib/scrapey/cache.rb', line 21
# Simply runs the given block and passes its value through.
#
# @yield the work to run
# @return [Object] the block's value
def without_cache
  result = yield
  result
end