Module: Scrapey
- Defined in:
- lib/scrapey/tee.rb,
lib/scrapey/tor.rb,
lib/scrapey/cache.rb,
lib/scrapey/multi.rb,
lib/scrapey/scrapey.rb,
lib/scrapey/database.rb,
lib/scrapey/template.rb,
lib/scrapey/constants.rb,
lib/scrapey/cache/disk.rb,
lib/scrapey/cache/redis.rb
Defined Under Namespace
Modules: Template Classes: Tee
Constant Summary collapse
- VERSION =
"0.0.17"
- BASEDIR =
File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
- URL =
"https://github.com/monkeysuffrage/scrapey"
Class Method Summary collapse
Instance Method Summary collapse
- #cache_filename(url) ⇒ Object
- #change_identity ⇒ Object
- #check_db_config ⇒ Object
- #debug(msg) ⇒ Object
- #delete_cache(url) ⇒ Object
- #disable_cache ⇒ Object
- #enqueue(url) ⇒ Object
- #fields(*args) ⇒ Object
- #get(*args) ⇒ Object
- #get_or_post(method, url, options = {}, *args) ⇒ Object
- #goto(*args) ⇒ Object
- #head(*args) ⇒ Object
- #init_db ⇒ Object
- #is_cached?(url) ⇒ Boolean
-
#load_cache(url) ⇒ Object
def load_cache url filename = cache_filename url return nil unless File::exists?(filename) debug "Loading #{filename} from cache" begin Mechanize::Page.new URI.parse(url), [], Marshal.load(File.open(filename, "rb"){|f| f.read}), nil, @agent rescue Exception => e puts e.message end end.
- #multi_get(*args) ⇒ Object
- #multi_get_or_post(method, all_urls, options = {}) ⇒ Object
- #multi_head(*args) ⇒ Object
- #multi_post(*args) ⇒ Object
- #post(*args) ⇒ Object
- #save(item, output = nil) ⇒ Object
- #save_cache(url, body, options = {}) ⇒ Object
- #save_images(urls) ⇒ Object
- #set_proxy(*args) ⇒ Object
- #tables(*args) ⇒ Object
- #truncate(*args) ⇒ Object
- #ts ⇒ Object
- #use_cache(options = {}) ⇒ Object
- #use_tor ⇒ Object
- #visit(*args) ⇒ Object
- #visited?(url) ⇒ Boolean
- #with_cache(cassette_name = 'my_cassette') ⇒ Object
- #without_cache ⇒ Object
Class Method Details
.init(b) ⇒ Object
3 4 5 6 7 8 9 10 |
# File 'lib/scrapey/scrapey.rb', line 3
# Mixes Scrapey into whatever object owns the binding +b+ and sets up a
# default Mechanize agent in that binding's +@agent+.
#
# @param b [Binding] binding in which the setup statements are evaluated
# @return [Object] value of the last evaluated statement
def self.init(b)
  b.eval "include Scrapey"
  # some defaults that I like
  b.eval "@agent ||= Mechanize.new{|a| a.history.max_size = 10}"
  b.eval "@agent.user_agent = 'Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}'"
  # NOTE(review): VERIFY_NONE turns off TLS certificate verification for the agent.
  b.eval "@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE"
end
Instance Method Details
#cache_filename(url) ⇒ Object
5 6 7 |
# File 'lib/scrapey/cache/disk.rb', line 5
# Builds the cache file path for +url+: the configured cache directory, a
# slash, the MD5 hex digest of the URL and a ".cache" suffix.
#
# @param url [String] URL whose cache entry is wanted
# @return [String] path of the cache file (the file may not exist)
def cache_filename(url)
  @config['cache_dir'] + "/" + Digest::MD5.hexdigest(url) + ".cache"
end
#change_identity ⇒ Object
8 9 10 11 12 13 14 |
# File 'lib/scrapey/tor.rb', line 8
# Connects to localhost:9051 via telnet, authenticates with an empty
# password and sends "signal NEWNYM".
#
# @raise [RuntimeError] when either command is not answered with "250 OK\n"
def change_identity
  debug "changing identity..."
  localhost = Net::Telnet.new("Host" => "localhost", "Port" => "9051", "Timeout" => 10, "Prompt" => /250 OK\n/)
  begin
    localhost.cmd('AUTHENTICATE ""') { |c| raise "Cannot authenticate to Tor" unless "250 OK\n" == c }
    localhost.cmd('signal NEWNYM') { |c| raise "Cannot switch Tor to new route" unless "250 OK\n" == c }
  ensure
    # Close the control connection even when one of the commands raised.
    localhost.close
  end
end
#check_db_config ⇒ Object
2 3 4 |
# File 'lib/scrapey/database.rb', line 2
# Guard used before database work: fails unless @config has a 'database' entry.
#
# @return [nil] when a database entry is present
# @raise [RuntimeError] "No database configured" when @config is nil or has
#   no 'database' entry
def check_db_config
  raise 'No database configured' unless @config && @config['database']
end
#debug(msg) ⇒ Object
108 109 110 |
# File 'lib/scrapey/scrapey.rb', line 108
# Prints +msg+ with puts, but only when @debug is truthy.
#
# @param msg [Object] message to print
# @return [nil]
def debug(msg)
  puts msg if @debug
end
#delete_cache(url) ⇒ Object
46 47 48 |
# File 'lib/scrapey/cache/disk.rb', line 46
# Removes the cache file for +url+. Best effort: any StandardError (for
# example a missing file) is swallowed and nil is returned instead.
#
# @param url [String] URL whose cache entry should be removed
# @return [Array<String>, nil] FileUtils.rm result, or nil on failure
def delete_cache(url)
  FileUtils.rm(cache_filename(url))
rescue StandardError
  nil
end
#disable_cache ⇒ Object
14 15 16 17 18 |
# File 'lib/scrapey/cache.rb', line 14
# Runs the given block with @use_cache set to false, then sets it to true.
# Note that the flag is set to true afterwards regardless of its prior value.
#
# @yield block to run with caching switched off
# @return [true]
def disable_cache
  @use_cache = false
  begin
    yield
  ensure
    # Re-enable even when the block raises, so a failure cannot leave the
    # cache switched off.
    @use_cache = true
  end
  true
end
#enqueue(url) ⇒ Object
116 117 118 119 120 |
# File 'lib/scrapey/scrapey.rb', line 116
# Appends +url+ plus a newline to "#{BASEDIR}/config/urls.txt". The file is
# opened in 'w' mode on first use and the handle is kept in @url_list.
#
# @param url [String] URL to record
# @return [File] the open url list file
def enqueue(url)
  @url_list ||= File.open("#{BASEDIR}/config/urls.txt", 'w')
  @url_list << url << "\n"
end
#fields(*args) ⇒ Object
57 58 59 |
# File 'lib/scrapey/scrapey.rb', line 57
# Stores the given arguments as @fields.
#
# @param args [Array] field names
# @return [Array] the stored list
def fields(*args)
  @fields = args
end
#get(*args) ⇒ Object
47 |
# File 'lib/scrapey/scrapey.rb', line 47
# Shorthand for get_or_post with method 'get'; all arguments are forwarded.
def get(*args)
  get_or_post 'get', *args
end
#get_or_post(method, url, options = {}, *args) ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/scrapey/scrapey.rb', line 13 def get_or_post method, url, ={}, *args agent = ['goto', 'visit'].include?(method) ? @browser : @agent _retries = .delete :retries _sleep = .delete :sleep begin new_args = method, url unless .empty? && args.empty? new_args << args.each{|arg| new_args << arg} end doc = load_cache(url) if @use_cache return doc if doc page = agent.send *new_args # str = page.respond_to?('root') ? page.root.to_s : page.body # save_cache(url, str) if @use_cache save_cache(url, page.body) if @use_cache #exit if Object.const_defined? :Ocra page rescue Exception => e case when defined? on_error return on_error e, method, url, , *args when _retries && _retries > 0 puts "Error. Retries remaining: #{[:retries]}" sleep _sleep if _sleep get_or_post method, url, .merge({:retries => _retries - 1, :sleep => _sleep}), *args else raise e end end end |
#goto(*args) ⇒ Object
50 |
# File 'lib/scrapey/scrapey.rb', line 50
# Shorthand for get_or_post with method 'goto'; all arguments are forwarded.
def goto(*args)
  get_or_post 'goto', *args
end
#head(*args) ⇒ Object
49 |
# File 'lib/scrapey/scrapey.rb', line 49
# Shorthand for get_or_post with method 'head'; all arguments are forwarded.
def head(*args)
  get_or_post 'head', *args
end
#init_db ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/scrapey/database.rb', line 24
# Requires ActiveRecord, its supporting libraries and the library named by
# @config['database']['adapter'], then connects using @config['database'].
#
# @raise [RuntimeError] when no database is configured
def init_db
  # Same guard as tables/truncate: fail with a clear message before
  # dereferencing @config['database'].
  check_db_config
  [
    'active_record',
    'active_record/schema',
    'active_record/connection_adapters/abstract/schema_definitions',
    @config['database']['adapter'],
    'tzinfo',
    'active_support/all',
    'active_support/multibyte/chars'
  ].each { |lib| require lib }
  ActiveRecord::Base.establish_connection(@config['database'])
end
#is_cached?(url) ⇒ Boolean
9 10 11 |
# File 'lib/scrapey/cache/disk.rb', line 9
# Tells whether a cache file exists for +url+.
#
# @param url [String] URL to look up
# @return [Boolean]
def is_cached?(url)
  # File.exist? — the File.exists? alias was removed in Ruby 3.2.
  File.exist?(cache_filename(url))
end
#load_cache(url) ⇒ Object
# Reads the cache file for +url+ (uncompressed Marshal data) and wraps the
# stored body in a Mechanize::Page.
#
# @param url [String] URL whose cached body should be loaded
# @return [Mechanize::Page, nil] nil when there is no cache file, or when
#   loading failed (the error message is printed)
def load_cache(url)
  filename = cache_filename(url)
  # File.exist? — the exists? alias was removed in Ruby 3.2.
  return nil unless File.exist?(filename)

  debug "Loading #{filename} from cache"
  begin
    # NOTE(review): Marshal.load must only ever see files written by save_cache.
    body = Marshal.load(File.binread(filename))
    Mechanize::Page.new(URI.parse(url), [], body, nil, @agent)
  rescue StandardError => e
    # StandardError rather than Exception so interrupts/exits are not swallowed.
    puts e.message
  end
end
# Writes the Marshal dump of +doc+ to the cache file for +url+ in binary mode.
#
# @param url [String] URL used to derive the cache file name
# @param doc [Object] value to store
# @param options [Hash] accepted but not used here
# @return [File] the (closed) file the dump was appended to
def save_cache(url, doc, options = {})
  target = cache_filename(url)
  File.open(target, "wb") do |file|
    file << Marshal.dump(doc)
  end
end
30 31 32 33 34 35 36 37 38 39 |
# File 'lib/scrapey/cache/disk.rb', line 30
# Reads the cache file for +url+, inflates and unmarshals it, and wraps the
# stored body in a Mechanize::Page.
#
# @param url [String] URL whose cached body should be loaded
# @return [Mechanize::Page, nil] nil when there is no cache file, or when
#   loading failed (the error message is printed)
def load_cache(url)
  filename = cache_filename(url)
  # File.exist? — the exists? alias was removed in Ruby 3.2.
  return nil unless File.exist?(filename)

  debug "Loading #{filename} from cache"
  begin
    # NOTE(review): Marshal.load must only ever see files written by save_cache.
    body = Marshal.load(Zlib::Inflate.inflate(File.binread(filename)))
    Mechanize::Page.new(URI.parse(url), [], body, nil, @agent)
  rescue StandardError => e
    # StandardError rather than Exception so interrupts/exits are not swallowed.
    puts e.message
  end
end
#multi_get(*args) ⇒ Object
50 |
# File 'lib/scrapey/multi.rb', line 50
# Shorthand for multi_get_or_post with method 'get_content'; all arguments
# are forwarded.
def multi_get(*args)
  multi_get_or_post 'get_content', *args
end
#multi_get_or_post(method, all_urls, options = {}) ⇒ Object
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/scrapey/multi.rb', line 7
# Requests +all_urls+ in batches of +threads+, one thread per URL, and hands
# each outcome to a callback method on self while holding @lock.
#
# @param method [String] name of the HTTP client method to call
# @param all_urls [Enumerable<String>] URLs to request
# @param options [Hash] :threads (20), :on_success (:on_success),
#   :on_error (:on_error), :user_agent, :proxy, :timeout (1000),
#   :follow_redirect (true), :query, :headers
# @yield [client] each newly created HTTPClient, for extra configuration
#
# NOTE(review): @http_clients is built once and reused, so a later call with a
# larger :threads value would index past the pool — confirm callers keep
# :threads constant.
def multi_get_or_post(method, all_urls, options = {})
  # some sensible defaults
  threads    = options[:threads] || 20
  on_success = options[:on_success] || :on_success
  on_error   = options[:on_error] || :on_error
  user_agent = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
  proxy      = options[:proxy]
  timeout    = options[:timeout] || 1000
  # `|| true` could never yield false; only fall back to true when unset.
  follow_redirect = options[:follow_redirect].nil? ? true : options[:follow_redirect]

  @lock ||= Mutex.new

  @http_clients ||= Array.new(threads) do
    client = HTTPClient.new proxy, user_agent
    client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
    client.receive_timeout = timeout
    yield client if block_given?
    client
  end

  debug 'starting multi'

  all_urls.each_slice(threads) do |urls|
    workers = urls.each_with_index.map do |url, i|
      Thread.new do
        response = nil
        error = nil
        begin
          response = @http_clients[i].send(method, url, options[:query], options[:headers], :follow_redirect => follow_redirect)
        rescue StandardError => e
          error = e
        end
        # Callbacks run one at a time so they can touch shared state safely.
        @lock.synchronize do
          if response
            send on_success, url, response
          else
            send on_error, url, error
          end
        end
      end
    end
    workers.each(&:join)
  end
end
#multi_head(*args) ⇒ Object
52 |
# File 'lib/scrapey/multi.rb', line 52
# Shorthand for multi_get_or_post with method 'head'; all arguments are
# forwarded.
def multi_head(*args)
  multi_get_or_post 'head', *args
end
#multi_post(*args) ⇒ Object
51 |
# File 'lib/scrapey/multi.rb', line 51
# Shorthand for multi_get_or_post with method 'post_content'; all arguments
# are forwarded.
def multi_post(*args)
  multi_get_or_post 'post_content', *args
end
#post(*args) ⇒ Object
48 |
# File 'lib/scrapey/scrapey.rb', line 48
# Shorthand for get_or_post with method 'post'; all arguments are forwarded.
def post(*args)
  get_or_post 'post', *args
end
#save(item, output = nil) ⇒ Object
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
# File 'lib/scrapey/scrapey.rb', line 77
# Appends +item+ as one row to a CSV file, opening the file and writing the
# header row on first use of each output name.
#
# The file name is +output+ with ".csv" appended unless it already ends in
# "csv". Columns are @fields when writing to the default @output and @fields
# is non-empty; otherwise they are the keys of the first item saved.
#
# NOTE(review): the CSV handles kept in @csvs are never closed in this method.
#
# @param item [Hash] row data keyed by column name
# @param output [String, nil] output name; defaults to @output
# @return [CSV] the CSV the row was appended to
def save(item, output = nil)
  output ||= @output
  @csvs ||= {}
  unless @csvs[output]
    fn = output.gsub(/(?<!csv)$/, '.csv')
    begin
      csv = CSV.open(fn, 'w')
    rescue Errno::EACCES
      # Typically the file is open in another program; stop rather than lose rows.
      puts "Unable to access #{fn} - is it locked?"
      exit
    end
    use_declared_fields = output == @output && @fields && !@fields.empty?
    columns = use_declared_fields ? @fields : item.keys
    csv << columns
    @csvs[output] = { :csv => csv, :fields => columns }
  end
  @csvs[output][:csv] << @csvs[output][:fields].map { |f| item[f] }
end
#save_cache(url, body, options = {}) ⇒ Object
41 42 43 |
# File 'lib/scrapey/cache/disk.rb', line 41
# Writes the deflated Marshal dump of +doc+ to the cache file for +url+ in
# binary mode.
#
# @param url [String] URL used to derive the cache file name
# @param doc [Object] value to store
# @param options [Hash] accepted but not used here
# @return [File] the (closed) file the data was appended to
def save_cache(url, doc, options = {})
  File.open(cache_filename(url), "wb") do |f|
    f << Zlib::Deflate.deflate(Marshal.dump(doc))
  end
end
#save_images(urls) ⇒ Object
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# File 'lib/scrapey/scrapey.rb', line 61
# Downloads each URL into "#{BASEDIR}/images", skipping files that already
# exist, and returns the file names (the part of each URL after the last "/").
#
# @param urls [Enumerable<String>] image URLs
# @return [Array<String>] file names, in the order of +urls+
# @raise [ArgumentError] when a URL has nothing after its last "/"
def save_images(urls)
  folder = "#{BASEDIR}/images"
  # Dir.exist? / File.exist? — the exists? aliases were removed in Ruby 3.2.
  Dir.mkdir(folder) unless Dir.exist?(folder)
  urls.map do |url|
    name = url[/[^\/]+$/]
    # Replaces a leftover `binding.pry` breakpoint with an explicit failure.
    raise ArgumentError, "cannot derive a file name from #{url.inspect}" unless name

    fn = "#{folder}/#{name}"
    unless File.exist?(fn)
      file = @agent.get(url)
      File.open(fn, 'wb') { |f| f << file.body }
    end
    name
  end
end
#set_proxy(*args) ⇒ Object
53 54 55 |
# File 'lib/scrapey/scrapey.rb', line 53
# Forwards all arguments to @agent.set_proxy and returns its result.
def set_proxy(*args)
  @agent.set_proxy(*args)
end
#tables(*args) ⇒ Object
6 7 8 9 10 11 12 13 14 15 |
# File 'lib/scrapey/database.rb', line 6
# Defines a top-level ActiveRecord::Base subclass for every given name and,
# if any of their tables is missing and "#{BASEDIR}/src/schema.rb" exists,
# requires that schema file.
#
# @param args [Array<String, Symbol>] constant names for the model classes
# @raise [RuntimeError] when no database is configured
def tables(*args)
  check_db_config
  missing_tables = false
  # Plain each (not any?): every constant must be defined even after the
  # first missing table is found.
  args.each do |arg|
    model = Object.const_set(arg, Class.new(ActiveRecord::Base))
    missing_tables = true unless model.table_exists?
  end
  schema = "#{BASEDIR}/src/schema.rb"
  # File.exist? — the exists? alias was removed in Ruby 3.2.
  require schema if missing_tables && File.exist?(schema)
end
#truncate(*args) ⇒ Object
17 18 19 20 21 22 |
# File 'lib/scrapey/database.rb', line 17
# Issues TRUNCATE TABLE for the table of every named model constant.
#
# @param args [Array<String, Symbol>] names of top-level model constants
# @raise [RuntimeError] when no database is configured
def truncate(*args)
  check_db_config
  connection = ActiveRecord::Base.connection
  args.each do |arg|
    table = Object.const_get(arg).table_name
    # Quote through the adapter so reserved words or unusual names stay valid SQL.
    connection.execute("TRUNCATE TABLE #{connection.quote_table_name(table)}")
  end
end
#ts ⇒ Object
112 113 114 |
# File 'lib/scrapey/scrapey.rb', line 112
# Current time as whole seconds since the epoch, rendered as a string.
#
# @return [String]
def ts
  Time.now.to_i.to_s
end
#use_cache(options = {}) ⇒ Object
3 4 5 6 7 8 9 10 11 12 |
# File 'lib/scrapey/cache.rb', line 3
# Switches caching on and loads a cache backend: the redis one when
# options[:redis] is given, the disk one otherwise. For the disk backend the
# cache directory defaults to "#{BASEDIR}/cache" and is created if needed.
#
# @param options [Hash] :redis is removed from the hash and kept in @redis
def use_cache(options = {})
  @use_cache = true
  @redis = options.delete(:redis)
  if @redis
    require 'scrapey/cache/redis'
  else
    require 'scrapey/cache/disk'
    @config['cache_dir'] ||= "#{BASEDIR}/cache"
    FileUtils.mkdir_p @config['cache_dir']
  end
end
#use_tor ⇒ Object
4 5 6 |
# File 'lib/scrapey/tor.rb', line 4
# Points the agent's proxy at localhost:8118 via set_proxy.
def use_tor
  set_proxy('localhost', 8118)
end
#visit(*args) ⇒ Object
51 |
# File 'lib/scrapey/scrapey.rb', line 51
# Shorthand for get_or_post with method 'visit'; all arguments are forwarded.
def visit(*args)
  get_or_post 'visit', *args
end
#visited?(url) ⇒ Boolean
101 102 103 104 105 106 |
# File 'lib/scrapey/scrapey.rb', line 101
# Reports whether +url+ was passed to this method before. As a side effect an
# unseen URL is recorded in @visited, so the next call with it returns true.
#
# @param url [Object] URL to check and record
# @return [Boolean]
def visited?(url)
  @visited ||= []
  return true if @visited.include?(url)

  @visited << url
  false
end
#with_cache(cassette_name = 'my_cassette') ⇒ Object
25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
# File 'lib/scrapey/cache.rb', line 25
# Runs the block inside a VCR cassette stored under "#{BASEDIR}/cache", with
# VCR hooked into fakeweb. The cassette records new episodes and matches
# requests on method, URI and body.
#
# @param cassette_name [String] name of the cassette to use
# @yield block to run while the cassette is active
def with_cache(cassette_name = 'my_cassette')
  require 'vcr'
  require 'fakeweb'

  VCR.configure do |c|
    c.cassette_library_dir = "#{BASEDIR}/cache"
    c.hook_into :fakeweb
    c.allow_http_connections_when_no_cassette = true
  end

  VCR.use_cassette(cassette_name, :record => :new_episodes, :match_requests_on => [:method, :uri, :body]) do
    yield
  end
end
#without_cache ⇒ Object
21 22 23 |
# File 'lib/scrapey/cache.rb', line 21
# Yields to the block and returns its value; no cache state is changed here.
def without_cache
  yield
end