Module: Scrapey

Defined in:
lib/scrapey/tee.rb,
lib/scrapey/tor.rb,
lib/scrapey/cache.rb,
lib/scrapey/multi.rb,
lib/scrapey/scrapey.rb,
lib/scrapey/database.rb,
lib/scrapey/template.rb,
lib/scrapey/constants.rb,
lib/scrapey/cache/disk.rb,
lib/scrapey/cache/redis.rb

Defined Under Namespace

Modules: Template Classes: Tee

Constant Summary collapse

VERSION =
"0.0.16"
BASEDIR =
File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
URL =
"https://github.com/monkeysuffrage/scrapey"

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.init(b) ⇒ Object



3
4
5
6
7
8
9
10
# File 'lib/scrapey/scrapey.rb', line 3

# Mixes Scrapey into the caller's scope and sets up a default Mechanize agent there.
#
# Each statement is evaluated against the supplied binding, so the `include`
# and the `@agent` instance variable take effect on that binding's `self`.
# An existing `@agent` is kept; its user agent string and SSL verify mode are
# (re)assigned either way. Note that SSL verification is switched off.
#
# @param b [Binding] binding of the scope that should receive Scrapey and @agent
# @return [Object] the value of the last evaluated statement
def self.init(b)
  b.eval("include Scrapey")

  # some defaults that I like
  b.eval("@agent ||= Mechanize.new{|a| a.history.max_size = 10}")
  identity = "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
  b.eval("@agent.user_agent = '#{identity}'")
  b.eval("@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE")
end

Instance Method Details

#cache_filename(url) ⇒ Object



5
6
7
# File 'lib/scrapey/cache/disk.rb', line 5

# Builds the path of the on-disk cache entry for a URL.
#
# The file name is the MD5 hex digest of the URL followed by ".cache", placed
# under the directory stored in @config['cache_dir'].
#
# @param url [String] URL whose cache path is wanted
# @return [String] path of the cache file (assuming cache_dir is a String)
def cache_filename(url)
  cache_dir = @config['cache_dir']
  cache_dir + "/" + "#{Digest::MD5.hexdigest(url)}.cache"
end

#change_identity ⇒ Object



8
9
10
11
12
13
14
# File 'lib/scrapey/tor.rb', line 8

# Asks the local Tor control port (localhost:9051) for a new circuit.
#
# Authenticates with an empty password and then sends `signal NEWNYM`. The
# control connection is closed even when one of the commands is rejected, so a
# failed attempt no longer leaks the socket.
#
# @return [nil]
# @raise [RuntimeError] when authentication or the NEWNYM signal is not answered with "250 OK"
def change_identity
  debug "changing identity..."
  localhost = Net::Telnet::new("Host" => "localhost", "Port" => "9051", "Timeout" => 10, "Prompt" => /250 OK\n/)
  begin
    localhost.cmd('AUTHENTICATE ""') {|c| raise "Cannot authenticate to Tor" unless "250 OK\n" == c}
    localhost.cmd('signal NEWNYM') {|c| raise "Cannot switch Tor to new route" unless "250 OK\n" == c}
  ensure
    # Always release the control connection, including on the raise paths above.
    localhost.close
  end
  nil
end

#check_db_config ⇒ Object



2
3
4
# File 'lib/scrapey/database.rb', line 2

# Verifies that a 'database' entry is present in @config.
#
# @return [nil] when a database entry exists
# @raise [RuntimeError] 'No database configured' when the entry is missing or falsy
def check_db_config
  return if @config['database']

  raise 'No database configured'
end

#debug(msg) ⇒ Object



81
82
83
# File 'lib/scrapey/scrapey.rb', line 81

# Prints a message to standard output, but only while @debug is truthy.
#
# @param msg [Object] message handed to `puts`
# @return [nil]
def debug(msg)
  return unless @debug

  puts msg
end

#delete_cache(url) ⇒ Object



28
29
30
# File 'lib/scrapey/cache/disk.rb', line 28

# Removes the on-disk cache entry that belongs to a URL.
#
# @param url [String] URL whose cache file should be deleted
# @return [Object] whatever FileUtils.rm returns
def delete_cache(url)
  cached_file = cache_filename(url)
  FileUtils.rm(cached_file)
end

#disable_cache ⇒ Object



14
15
16
17
18
# File 'lib/scrapey/cache.rb', line 14

# Runs the given block with caching switched off.
#
# The previous value of @use_cache is put back afterwards, also when the block
# raises. Caching is therefore neither left disabled by a failure nor switched
# on for callers that never enabled it, and nested calls behave correctly.
#
# @yield the work to perform without the cache
# @return [Object] the value of the block
def disable_cache
  previous = @use_cache
  @use_cache = false
  yield
ensure
  @use_cache = previous
end

#enqueue(url) ⇒ Object



89
90
91
92
93
# File 'lib/scrapey/scrapey.rb', line 89

# Appends a URL, followed by a newline, to config/urls.txt under BASEDIR.
#
# The file is opened in write mode on first use and the handle is kept in
# @url_list for subsequent calls.
#
# @param url [Object] value written as one line
# @return [File] the open url list file
def enqueue(url)
  @url_list ||= File.open("#{BASEDIR}/config/urls.txt", 'w')
  @url_list << url << "\n"
end

#fields(*args) ⇒ Object



56
57
58
# File 'lib/scrapey/scrapey.rb', line 56

# Stores the given field names in @fields.
#
# #save reads @fields for the header row and for picking values out of
# Hash and CSV::Row items.
#
# @param args [Array] field names, in output order
# @return [Array] the stored list
def fields(*args)
  @fields = args
end

#get(*args) ⇒ Object



46
# File 'lib/scrapey/scrapey.rb', line 46

# Issues a 'get' request through #get_or_post.
#
# @param args [Array] URL followed by any further arguments, forwarded unchanged
# @return [Object] whatever #get_or_post returns
def get(*args)
  get_or_post('get', *args)
end

#get_or_post(method, url, options = {}, *args) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/scrapey/scrapey.rb', line 13

# Performs a request through the agent or browser, with optional caching and retries.
#
# 'goto' and 'visit' are sent to @browser, every other method name to @agent.
# While @use_cache is truthy the cache is consulted first and a successful
# response is stored afterwards.
#
# Error handling, in order of precedence:
# 1. if an `on_error` method is defined, it receives the error and its result is returned;
# 2. otherwise, while retries remain, the request is repeated (after an optional sleep);
# 3. otherwise the error is re-raised.
#
# Only StandardError is handled, so Interrupt and SystemExit are no longer
# swallowed or retried.
#
# @param method [String] method name to send to the agent/browser ('get', 'post', 'goto', ...)
# @param url [String] target URL, also used as the cache key
# @param options [Hash] forwarded to the agent, minus :retries (Integer) and :sleep
#   (seconds between retries); the caller's object is left untouched
# @param args [Array] further arguments forwarded to the agent
# @return [Object] the cached document, the fetched page, or the result of `on_error`
def get_or_post method, url, options={}, *args
  agent = ['goto', 'visit'].include?(method) ? @browser : @agent
  # Work on a copy so the caller's hash keeps its :retries / :sleep entries.
  options = options.dup
  _retries = options.delete :retries
  _sleep = options.delete :sleep
  begin
    new_args = [method, url]
    unless options.empty? && args.empty?
      new_args << options
      new_args.concat args
    end

    doc = load_cache(url) if @use_cache
    return doc if doc

    page = agent.send(*new_args)
    if @use_cache
      # Serialise the page only when it is actually going to be stored.
      str = page.respond_to?('root') ? page.root.to_s : page.body
      save_cache(url, str)
    end

    #exit if Object.const_defined? :Ocra
    page
  rescue StandardError => e
    if defined? on_error
      on_error e, method, url, options, *args
    elsif _retries && _retries > 0
      # :retries was removed from options above, so report the local counter.
      puts "Error. Retries remaining: #{_retries}"
      sleep _sleep if _sleep
      get_or_post method, url, options.merge({:retries => _retries - 1, :sleep => _sleep}), *args
    else
      raise
    end
  end
end

#goto(*args) ⇒ Object



49
# File 'lib/scrapey/scrapey.rb', line 49

# Issues a 'goto' request through #get_or_post.
#
# @param args [Array] URL followed by any further arguments, forwarded unchanged
# @return [Object] whatever #get_or_post returns
def goto(*args)
  get_or_post('goto', *args)
end

#head(*args) ⇒ Object



48
# File 'lib/scrapey/scrapey.rb', line 48

# Issues a 'head' request through #get_or_post.
#
# @param args [Array] URL followed by any further arguments, forwarded unchanged
# @return [Object] whatever #get_or_post returns
def head(*args)
  get_or_post('head', *args)
end

#init_db ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/scrapey/database.rb', line 24

# Loads ActiveRecord plus the configured adapter and opens the database connection.
#
# The configuration is validated first, so a missing 'database' entry fails
# with the same 'No database configured' error as the other database helpers
# instead of a NoMethodError on nil.
#
# @return [Object] whatever ActiveRecord::Base.establish_connection returns
# @raise [RuntimeError] when no database is configured
def init_db
  check_db_config
  [
    'active_record',
    'active_record/schema',
    'active_record/connection_adapters/abstract/schema_definitions',
    # the adapter name from the config is required as a library of that name
    @config['database']['adapter'],
    'tzinfo',
    'active_support/all',
    'active_support/multibyte/chars'
  ].each { |lib| require lib }
  ActiveRecord::Base.establish_connection(@config['database'])
end

#is_cached?(url) ⇒ Boolean

Returns:

  • (Boolean)


9
10
11
# File 'lib/scrapey/cache/disk.rb', line 9

# Tells whether a cache file exists on disk for the URL.
#
# Uses File.exist?; the `File.exists?` alias was removed in Ruby 3.2.
#
# @param url [String] URL to look up
# @return [Boolean] true when the cache file is present
def is_cached? url
  File.exist? cache_filename(url)
end

#load_cache(url) ⇒ Object



13
14
15
16
17
18
19
20
21
22
# File 'lib/scrapey/cache/disk.rb', line 13

# Loads a cached page for the URL from disk, if one exists.
#
# The cache file holds a Marshal dump of the page body; it is wrapped in a
# Mechanize::Page bound to @agent. A cache entry that cannot be read or parsed
# is reported on stdout and treated as a miss.
#
# NOTE(review): Marshal.load must only ever see trusted files; keep the cache
# directory out of reach of untrusted writers.
#
# @param url [String] URL whose cache entry should be loaded
# @return [Mechanize::Page, nil] the cached page, or nil on a miss or a broken entry
def load_cache url
  filename = cache_filename url
  # File.exists? was removed in Ruby 3.2
  return nil unless File.exist?(filename)
  debug "Loading #{filename} from cache"
  begin
    uri = URI.parse(url)
    body = Marshal.load(File.binread(filename))
    Mechanize::Page.new uri, [], body, nil, @agent
  rescue StandardError => e
    puts e.message
    nil
  end
end

#multi_get(*args) ⇒ Object



50
# File 'lib/scrapey/multi.rb', line 50

# Fetches many URLs concurrently using the 'get_content' client method.
#
# A block given here is now passed on to #multi_get_or_post, which yields
# every newly created HTTP client to it; previously the block was dropped.
#
# @param args [Array] list of URLs and optional options hash, forwarded unchanged
# @yield [client] optional per-client configuration hook
# @return [Object] whatever #multi_get_or_post returns
def multi_get(*args, &block)
  multi_get_or_post('get_content', *args, &block)
end

#multi_get_or_post(method, all_urls, options = {}) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/scrapey/multi.rb', line 7

# Requests many URLs concurrently, one thread per URL in slices of `threads`.
#
# Each URL in a slice is sent through its own HTTPClient from a pool kept in
# @http_clients. Results are reported through callbacks invoked on `self`
# while holding @lock, so callbacks never run concurrently.
#
# Fixes over the previous version:
# * `:follow_redirect => false` is honoured (it used to be forced to true);
# * the client pool grows when a later call asks for more threads than the
#   first one did, instead of indexing past the end of the pool;
# * only StandardError is turned into an `on_error` callback.
#
# NOTE(review): existing pooled clients keep the proxy, user agent and timeout
# they were created with; only newly added clients see the current options.
#
# @param method [String] HTTPClient method to call ('get_content', 'post_content', 'head', ...)
# @param all_urls [Array] URLs to request
# @param options [Hash] :threads (20), :on_success (:on_success), :on_error (:on_error),
#   :user_agent, :proxy, :timeout (1000), :follow_redirect (true), :query, :headers
# @yield [client] every newly created HTTPClient, for extra configuration
# @return [Object] the result of `all_urls.each_slice`
def multi_get_or_post method, all_urls, options = {}

  # some sensible defaults
  threads         = options[:threads]         || 20
  on_success      = options[:on_success]      || :on_success
  on_error        = options[:on_error]        || :on_error
  user_agent      = options[:user_agent]      || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
  proxy           = options[:proxy]
  timeout         = options[:timeout]         || 1000
  # `|| true` can never yield false; only fall back to true when nothing was given.
  follow_redirect = options[:follow_redirect]
  follow_redirect = true if follow_redirect.nil?

  @lock ||= Mutex.new

  # Top the pool up to the requested size; clients from earlier calls are reused.
  @http_clients ||= []
  (@http_clients.size...threads).each do
    c = HTTPClient.new proxy, user_agent
    c.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
    c.receive_timeout = timeout
    yield c if block_given?
    @http_clients << c
  end

  debug 'starting multi'

  all_urls.each_slice(threads) do |urls|
    urls.each_with_index.map do |url, i|
      Thread.new do
        response = error = nil
        begin
          response = @http_clients[i].send method, url, options[:query], options[:headers], :follow_redirect => follow_redirect
        rescue StandardError => e
          error = e
        end
        @lock.synchronize do
          if response
            send on_success, url, response
          else
            send on_error, url, error
          end
        end
      end
    end.each { |thread| thread.join }
  end
end

#multi_head(*args) ⇒ Object



52
# File 'lib/scrapey/multi.rb', line 52

# Requests many URLs concurrently using the 'head' client method.
#
# A block given here is now passed on to #multi_get_or_post, which yields
# every newly created HTTP client to it; previously the block was dropped.
#
# @param args [Array] list of URLs and optional options hash, forwarded unchanged
# @yield [client] optional per-client configuration hook
# @return [Object] whatever #multi_get_or_post returns
def multi_head(*args, &block)
  multi_get_or_post('head', *args, &block)
end

#multi_post(*args) ⇒ Object



51
# File 'lib/scrapey/multi.rb', line 51

# Posts to many URLs concurrently using the 'post_content' client method.
#
# A block given here is now passed on to #multi_get_or_post, which yields
# every newly created HTTP client to it; previously the block was dropped.
#
# @param args [Array] list of URLs and optional options hash, forwarded unchanged
# @yield [client] optional per-client configuration hook
# @return [Object] whatever #multi_get_or_post returns
def multi_post(*args, &block)
  multi_get_or_post('post_content', *args, &block)
end

#post(*args) ⇒ Object



47
# File 'lib/scrapey/scrapey.rb', line 47

# Issues a 'post' request through #get_or_post.
#
# @param args [Array] URL followed by any further arguments, forwarded unchanged
# @return [Object] whatever #get_or_post returns
def post(*args)
  get_or_post('post', *args)
end

#save(item) ⇒ Object



60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/scrapey/scrapey.rb', line 60

# Writes one record to the CSV file named by @output.
#
# The file is (re)opened in write mode whenever no open handle is held in
# @csv; at that point the @fields list, if set, is written as the first row.
# Arrays are written as they are; Hash and CSV::Row items are projected onto
# @fields, in that order.
#
# @param item [Array, Hash, CSV::Row] record to append
# @return [CSV] the open CSV handle
# @raise [RuntimeError] 'No fields defined!' for a Hash/CSV::Row when @fields is unset
# @raise [RuntimeError] 'unsupported type: ...' for any other kind of item
def save(item)
  if !@csv || @csv.closed?
    @csv = CSV.open(@output, 'w')
    @csv << @fields if @fields
  end

  case item
  when Array
    @csv << item
  when Hash, CSV::Row
    raise 'No fields defined!' unless @fields
    @csv << @fields.map { |field| item[field] }
  else
    raise "unsupported type: #{item.class}"
  end
end

#save_cache(url, body, options = {}) ⇒ Object



24
25
26
# File 'lib/scrapey/cache/disk.rb', line 24

# Stores a document in the disk cache under the file derived from the URL.
#
# The document is serialised with Marshal and written in binary mode.
#
# @param url [String] URL used to derive the cache file name
# @param doc [Object] document to cache (any Marshal-dumpable object)
# @param options [Hash] accepted but not used by this implementation
# @return [File] the (closed) cache file handle
def save_cache(url, doc, options = {})
  File.open(cache_filename(url), "wb") do |cache_file|
    serialized = Marshal.dump(doc)
    cache_file << serialized
  end
end

#set_proxy(*args) ⇒ Object



52
53
54
# File 'lib/scrapey/scrapey.rb', line 52

# Forwards the given proxy settings to the agent held in @agent.
#
# @param args [Array] arguments handed to the agent's `set_proxy` unchanged
# @return [Object] whatever the agent's `set_proxy` returns
def set_proxy(*args)
  @agent.set_proxy(*args)
end

#tables(*args) ⇒ Object



6
7
8
9
10
11
12
13
14
15
# File 'lib/scrapey/database.rb', line 6

# Defines an ActiveRecord model constant for every given name.
#
# Each name becomes a top-level constant bound to a new subclass of
# ActiveRecord::Base. If any of those models reports a missing table and
# BASEDIR/src/schema.rb exists, that schema file is required.
#
# Uses File.exist?; the `File.exists?` alias was removed in Ruby 3.2.
#
# @param args [Array<String, Symbol>] model (constant) names to define
# @return [Object] the result of `require`, or nil when no schema was loaded
# @raise [RuntimeError] when no database is configured
def tables *args
  check_db_config
  missing_tables = false
  args.each do |arg|
    model = Object.const_set(arg, Class.new(ActiveRecord::Base))
    missing_tables = true unless model.table_exists?
  end
  schema = "#{BASEDIR}/src/schema.rb"
  require schema if missing_tables && File.exist?(schema)
end

#truncate(*args) ⇒ Object



17
18
19
20
21
22
# File 'lib/scrapey/database.rb', line 17

# Empties the tables behind the given models with TRUNCATE TABLE.
#
# Every name is resolved to a top-level constant and that model's table_name
# is truncated through the ActiveRecord::Base connection.
#
# @param args [Array<String, Symbol>] names of model constants
# @return [Array] the list of names that was passed in
# @raise [RuntimeError] when no database is configured
def truncate(*args)
  check_db_config
  args.each do |model_name|
    connection = ActiveRecord::Base.connection
    table = Object.const_get(model_name).table_name
    connection.execute("TRUNCATE TABLE #{table}")
  end
end

#ts ⇒ Object



85
86
87
# File 'lib/scrapey/scrapey.rb', line 85

# Returns the current time as a string of whole seconds since the Unix epoch.
#
# @return [String] e.g. "1700000000"
def ts
  epoch_seconds = Time.now.to_i
  epoch_seconds.to_s
end

#use_cache(options = {}) ⇒ Object



3
4
5
6
7
8
9
10
11
12
# File 'lib/scrapey/cache.rb', line 3

# Turns caching on and loads the matching cache backend.
#
# With a truthy :redis option the redis backend is required and the given
# value is kept in @redis. Otherwise the disk backend is required and the cache
# directory (@config['cache_dir'], defaulting to BASEDIR/cache) is created.
#
# The options hash is only read; it used to have its :redis entry removed,
# which silently altered the caller's hash.
#
# @param options [Hash] :redis => object stored in @redis (selects the redis backend)
# @return [Object] the result of the last backend setup step
def use_cache options = {}
  @use_cache = true
  @redis = options[:redis]
  if @redis
    require 'scrapey/cache/redis'
  else
    require 'scrapey/cache/disk'
    @config['cache_dir'] ||= "#{BASEDIR}/cache"
    FileUtils.mkdir_p @config['cache_dir']
  end
end

#use_tor ⇒ Object



4
5
6
# File 'lib/scrapey/tor.rb', line 4

# Routes the agent through the proxy at localhost:8118 by calling #set_proxy.
#
# @return [Object] whatever #set_proxy returns
def use_tor
  proxy_host = 'localhost'
  proxy_port = 8118
  set_proxy(proxy_host, proxy_port)
end

#visit(*args) ⇒ Object



50
# File 'lib/scrapey/scrapey.rb', line 50

# Issues a 'visit' request through #get_or_post.
#
# @param args [Array] URL followed by any further arguments, forwarded unchanged
# @return [Object] whatever #get_or_post returns
def visit(*args)
  get_or_post('visit', *args)
end

#visited?(url) ⇒ Boolean

Returns:

  • (Boolean)


74
75
76
77
78
79
# File 'lib/scrapey/scrapey.rb', line 74

# Reports whether a URL has been seen before, recording it when it has not.
#
# Seen URLs are kept in the @visited list, so the first call for a URL
# returns false and every later call for it returns true.
#
# @param url [Object] URL (or any comparable key) to check
# @return [Boolean] true when the URL was already recorded
def visited?(url)
  seen = (@visited ||= [])
  already_seen = seen.include?(url)
  seen << url unless already_seen
  already_seen
end

#with_cache(cassette_name = 'my_cassette') ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/scrapey/cache.rb', line 25

# Runs the given block inside a VCR cassette.
#
# VCR is configured on every call: cassettes live in BASEDIR/cache, FakeWeb is
# the hook, and HTTP connections stay allowed when no cassette is active.
# The cassette is used with :record => :new_episodes and requests are matched
# on method, URI and body.
#
# @param cassette_name [String] name of the cassette to use
# @yield the work whose HTTP traffic should go through the cassette
# @return [Object] whatever VCR.use_cassette returns
def with_cache(cassette_name = 'my_cassette')
  require 'vcr'
  require 'fakeweb'

  VCR.configure do |vcr_config|
    vcr_config.cassette_library_dir = "#{BASEDIR}/cache"
    vcr_config.hook_into :fakeweb
    vcr_config.allow_http_connections_when_no_cassette = true
  end

  VCR.use_cassette(cassette_name, :record => :new_episodes, :match_requests_on => [:method, :uri, :body]) { yield }
end

#without_cache ⇒ Object



21
22
23
# File 'lib/scrapey/cache.rb', line 21

# Runs the given block as it is, without changing any caching state.
#
# @yield the work to perform
# @return [Object] the value of the block
def without_cache
  block_result = yield
  block_result
end