Module: Scrapey

Defined in:
lib/scrapey/tee.rb,
lib/scrapey/tor.rb,
lib/scrapey/cache.rb,
lib/scrapey/multi.rb,
lib/scrapey/scrapey.rb,
lib/scrapey/database.rb,
lib/scrapey/template.rb,
lib/scrapey/constants.rb,
lib/scrapey/cache/disk.rb,
lib/scrapey/cache/redis.rb

Defined Under Namespace

Modules: Template
Classes: Tee

Constant Summary collapse

VERSION =
"0.0.17"
BASEDIR =
File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
URL =
"https://github.com/monkeysuffrage/scrapey"

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.init(b) ⇒ Object



3
4
5
6
7
8
9
10
# File 'lib/scrapey/scrapey.rb', line 3

# Mixes Scrapey into the object behind the given binding and gives it a
# default @agent.
#
# Every statement is evaluated inside +b+, so the include and the @agent
# instance variable belong to that binding's self.
#
# @param b [Binding] binding the setup statements are evaluated in
# @return [Object] the value of the last evaluated statement
def self.init(b)
  setup = [
    "include Scrapey",
    # some defaults that I like
    "@agent ||= Mechanize.new{|a| a.history.max_size = 10}",
    "@agent.user_agent = 'Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}'",
    "@agent.verify_mode = OpenSSL::SSL::VERIFY_NONE"
  ]
  setup.map { |statement| eval(statement, b) }.last
end

Instance Method Details

#cache_filename(url) ⇒ Object



5
6
7
# File 'lib/scrapey/cache/disk.rb', line 5

# Builds the path of the disk-cache entry for a URL.
#
# The file name is the MD5 hex digest of the URL with a ".cache" suffix,
# placed under @config['cache_dir'].
#
# @param url [String] the URL used as cache key
# @return [String] path of the cache file
def cache_filename(url)
  cache_dir = @config['cache_dir']
  entry = "/" + Digest::MD5.hexdigest(url) + ".cache"
  cache_dir + entry
end

#change_identity ⇒ Object



8
9
10
11
12
13
14
# File 'lib/scrapey/tor.rb', line 8

# Asks Tor for a new route by talking to its control port over Telnet.
#
# Connects to localhost:9051, authenticates with an empty password and sends
# NEWNYM. Each reply must be exactly "250 OK\n".
#
# The connection is closed in an ensure clause, so a failed command no longer
# leaves the socket open.
#
# @return [nil]
# @raise [RuntimeError] when authentication or the NEWNYM signal is rejected
def change_identity
  debug "changing identity..."
  localhost = Net::Telnet.new("Host" => "localhost", "Port" => "9051", "Timeout" => 10, "Prompt" => /250 OK\n/)
  begin
    localhost.cmd('AUTHENTICATE ""') { |c| raise "Cannot authenticate to Tor" unless "250 OK\n" == c }
    localhost.cmd('signal NEWNYM') { |c| raise "Cannot switch Tor to new route" unless "250 OK\n" == c }
  ensure
    # Runs on success and on failure alike.
    localhost.close
  end
  nil
end

#check_db_config ⇒ Object



2
3
4
# File 'lib/scrapey/database.rb', line 2

# Guards database helpers: fails unless @config has a 'database' entry.
#
# @return [nil] when a database is configured
# @raise [RuntimeError] 'No database configured' otherwise
def check_db_config
  return if @config['database']

  raise 'No database configured'
end

#debug(msg) ⇒ Object



108
109
110
# File 'lib/scrapey/scrapey.rb', line 108

# Prints a message to standard output, but only while @debug is set.
#
# @param msg [Object] the message handed to puts
# @return [nil]
def debug(msg)
  return unless @debug

  puts msg
end

#delete_cache(url) ⇒ Object



46
47
48
# File 'lib/scrapey/cache/disk.rb', line 46

# Removes the disk-cache entry for a URL, ignoring failures.
#
# Any StandardError raised while removing the file (for example when the
# entry does not exist) is swallowed and nil is returned instead.
#
# @param url [String] the URL whose cache file should be deleted
# @return [Object, nil] the result of FileUtils.rm, or nil on failure
def delete_cache(url)
  FileUtils.rm(cache_filename(url))
rescue StandardError
  nil
end

#disable_cache ⇒ Object



14
15
16
17
18
# File 'lib/scrapey/cache.rb', line 14

# Runs the given block with caching switched off.
#
# @use_cache is set to false for the duration of the block and then put back
# to whatever it was before. The restore happens in an ensure clause, so it
# also runs when the block raises, and a cache that was never enabled is not
# turned on as a side effect.
#
# @yield the work to perform without the cache
# @return [Object] the value of the block
def disable_cache
  previous = @use_cache
  @use_cache = false
  yield
ensure
  @use_cache = previous
end

#enqueue(url) ⇒ Object



116
117
118
119
120
# File 'lib/scrapey/scrapey.rb', line 116

# Appends a URL, followed by a newline, to config/urls.txt under BASEDIR.
#
# The file is opened in 'w' mode on first use and the handle is kept in
# @url_list; this method neither flushes nor closes it.
#
# @param url [String] the URL to record
# @return [File] the open url list
def enqueue(url)
  @url_list ||= File.open("#{BASEDIR}/config/urls.txt", 'w')
  @url_list << url << "\n"
end

#fields(*args) ⇒ Object



57
58
59
# File 'lib/scrapey/scrapey.rb', line 57

# Stores the given field names in @fields.
#
# @param args [Array] the field names, in order
# @return [Array] the stored list
def fields(*args)
  @fields = args
end

#get(*args) ⇒ Object



47
# File 'lib/scrapey/scrapey.rb', line 47

# Shortcut for get_or_post with the 'get' method.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever get_or_post returns
def get(*args)
  get_or_post('get', *args)
end

#get_or_post(method, url, options = {}, *args) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/scrapey/scrapey.rb', line 13

# Sends a request through the agent, using the page cache when it is enabled.
#
# 'goto' and 'visit' are dispatched to @browser; every other method name is
# sent to @agent. When @use_cache is set, a cached document for the URL is
# returned without a request, and a freshly fetched page body is cached.
#
# On failure: if an on_error method is defined it is called with the error and
# the original arguments, and its result is returned. Otherwise the call is
# retried while :retries allows it, and the error is re-raised once it does not.
#
# Only StandardError is rescued, so Interrupt and SystemExit pass through.
#
# @param method [String] agent method to invoke ('get', 'post', 'head', 'goto', 'visit')
# @param url [String] the URL to request; also the cache key
# @param options [Hash] forwarded to the agent, minus the keys consumed here:
#   :retries (remaining retry count) and :sleep (seconds to wait before a retry).
#   The caller's hash is left untouched.
# @param args [Array] extra arguments forwarded to the agent
# @return [Object] the cached document, the agent's page, or on_error's result
# @raise [StandardError] the request error when it is neither handled nor retried
def get_or_post(method, url, options = {}, *args)
  agent = %w[goto visit].include?(method) ? @browser : @agent
  # Work on a copy: stripping :retries/:sleep from the caller's hash would
  # silently disable retries when the same hash is reused for another call.
  options = options.dup
  _retries = options.delete :retries
  _sleep = options.delete :sleep
  begin
    new_args = [method, url]
    unless options.empty? && args.empty?
      new_args << options
      new_args.concat(args)
    end

    doc = load_cache(url) if @use_cache
    return doc if doc

    page = agent.send(*new_args)
    # str = page.respond_to?('root') ? page.root.to_s : page.body
    # save_cache(url, str) if @use_cache
    save_cache(url, page.body) if @use_cache

    page
  rescue StandardError => e
    if defined?(on_error)
      on_error e, method, url, options, *args
    elsif _retries && _retries > 0
      # options no longer holds :retries at this point, so report the local.
      puts "Error. Retries remaining: #{_retries}"
      sleep _sleep if _sleep
      get_or_post method, url, options.merge(:retries => _retries - 1, :sleep => _sleep), *args
    else
      raise
    end
  end
end

#goto(*args) ⇒ Object



50
# File 'lib/scrapey/scrapey.rb', line 50

# Shortcut for get_or_post with the 'goto' method.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever get_or_post returns
def goto(*args)
  get_or_post('goto', *args)
end

#head(*args) ⇒ Object



49
# File 'lib/scrapey/scrapey.rb', line 49

# Shortcut for get_or_post with the 'head' method.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever get_or_post returns
def head(*args)
  get_or_post('head', *args)
end

#init_db ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/scrapey/database.rb', line 24

# Loads ActiveRecord and its supporting libraries, then connects to the
# configured database.
#
# The adapter named in @config['database']['adapter'] is required by name,
# in between the ActiveRecord and ActiveSupport libraries.
#
# @return [Object] the result of ActiveRecord::Base.establish_connection
def init_db
  db_config = @config['database']
  libraries = %w[
    active_record
    active_record/schema
    active_record/connection_adapters/abstract/schema_definitions
  ]
  libraries << db_config['adapter']
  libraries.concat(%w[tzinfo active_support/all active_support/multibyte/chars])
  libraries.each { |library| require library }
  ActiveRecord::Base.establish_connection(db_config)
end

#is_cached?(url) ⇒ Boolean

Returns:

  • (Boolean)


9
10
11
# File 'lib/scrapey/cache/disk.rb', line 9

# Tells whether a disk-cache entry exists for the URL.
#
# Uses File.exist?; the File.exists? alias was removed in Ruby 3.2.
#
# @param url [String] the URL to look up
# @return [Boolean] true when the cache file is present
def is_cached?(url)
  File.exist?(cache_filename(url))
end

#load_cache(url) ⇒ Object

# Loads a page from the disk cache (variant that reads uncompressed Marshal data).
#
# Uses File.exist? because the File.exists? alias was removed in Ruby 3.2.
# Rescues StandardError rather than Exception so Interrupt and SystemExit
# are not swallowed.
#
# @param url [String] the URL used as cache key
# @return [Mechanize::Page, nil] the cached page; nil when there is no entry
#   or the entry cannot be read (the error message is printed)
def load_cache(url)
  filename = cache_filename url
  return nil unless File.exist?(filename)
  debug "Loading #{filename} from cache"
  begin
    # NOTE(review): Marshal.load is only safe on trusted data; the cache
    # directory must not be writable by untrusted parties.
    body = Marshal.load(File.binread(filename))
    Mechanize::Page.new URI.parse(url), [], body, nil, @agent
  rescue StandardError => e
    # puts returns nil, which callers see as a cache miss.
    puts e.message
  end
end

# Writes a document to the disk cache as uncompressed Marshal data.
#
# @param url [String] the URL used as cache key
# @param doc [Object] the document to serialize
# @param options [Hash] accepted but not used by this method
# @return [File] the (closed) cache file
def save_cache(url, doc, options = {})
  File.open(cache_filename(url), "wb") do |file|
    serialized = Marshal.dump(doc)
    file << serialized
  end
end


30
31
32
33
34
35
36
37
38
39
# File 'lib/scrapey/cache/disk.rb', line 30

# Loads a page from the disk cache.
#
# Cache files hold a zlib-deflated Marshal dump of the page body.
# Uses File.exist? because the File.exists? alias was removed in Ruby 3.2.
# Rescues StandardError rather than Exception so Interrupt and SystemExit
# are not swallowed.
#
# @param url [String] the URL used as cache key
# @return [Mechanize::Page, nil] the cached page; nil when there is no entry
#   or the entry cannot be read (the error message is printed)
def load_cache(url)
  filename = cache_filename url
  return nil unless File.exist?(filename)
  debug "Loading #{filename} from cache"
  begin
    # NOTE(review): Marshal.load is only safe on trusted data; the cache
    # directory must not be writable by untrusted parties.
    body = Marshal.load(Zlib::Inflate.inflate(File.binread(filename)))
    Mechanize::Page.new URI.parse(url), [], body, nil, @agent
  rescue StandardError => e
    # puts returns nil, which callers see as a cache miss.
    puts e.message
  end
end

#multi_get(*args) ⇒ Object



50
# File 'lib/scrapey/multi.rb', line 50

# Shortcut for multi_get_or_post with the 'get_content' client method.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever multi_get_or_post returns
def multi_get(*args)
  multi_get_or_post('get_content', *args)
end

#multi_get_or_post(method, all_urls, options = {}) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/scrapey/multi.rb', line 7

# Fetches many URLs concurrently with a pool of HTTPClient instances.
#
# URLs are processed in slices of +threads+; each URL in a slice runs in its
# own thread on its own client. Results are reported through callbacks on
# self, invoked while holding @lock.
#
# Clients are kept in @http_clients across calls. proxy, user_agent and
# timeout are applied only when a client is created, and the optional block
# is yielded each new client for further configuration.
#
# @param method [String] HTTPClient method to call ('get_content', 'post_content', 'head')
# @param all_urls [Enumerable] the URLs to request
# @param options [Hash] :threads (default 20), :on_success and :on_error
#   (callback method names, defaulting to :on_success / :on_error),
#   :user_agent, :proxy, :timeout (default 1000), :follow_redirect
#   (default true), plus :query and :headers which are passed to the client
# @yield [client] each newly created HTTPClient
# @return [Object] the result of iterating all_urls in slices
def multi_get_or_post(method, all_urls, options = {})
  # some sensible defaults
  threads         = options[:threads]    || 20
  on_success      = options[:on_success] || :on_success
  on_error        = options[:on_error]   || :on_error
  user_agent      = options[:user_agent] || "Scrapey v#{Scrapey::VERSION} - #{Scrapey::URL}"
  proxy           = options[:proxy]      || nil
  timeout         = options[:timeout]    || 1000
  # fetch instead of "|| true", which made an explicit false impossible.
  follow_redirect = options.fetch(:follow_redirect, true)

  @lock ||= Mutex.new

  # Grow the shared pool on demand so every slot of a slice has a client,
  # even when a later call asks for more threads than an earlier one did.
  @http_clients ||= []
  (@http_clients.size...threads).each do
    client = HTTPClient.new proxy, user_agent
    client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
    client.receive_timeout = timeout
    yield client if block_given?
    @http_clients << client
  end

  debug 'starting multi'

  all_urls.each_slice(threads) do |urls|
    workers = urls.each_with_index.map do |url, i|
      Thread.new do
        response = nil
        error = nil
        begin
          response = @http_clients[i].send method, url, options[:query], options[:headers], :follow_redirect => follow_redirect
        rescue StandardError => e
          error = e
        end
        # Callbacks run one at a time so they need no locking of their own.
        @lock.synchronize do
          if response
            send on_success, url, response
          else
            send on_error, url, error
          end
        end
      end
    end
    workers.each(&:join)
  end
end

#multi_head(*args) ⇒ Object



52
# File 'lib/scrapey/multi.rb', line 52

# Shortcut for multi_get_or_post with the 'head' client method.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever multi_get_or_post returns
def multi_head(*args)
  multi_get_or_post('head', *args)
end

#multi_post(*args) ⇒ Object



51
# File 'lib/scrapey/multi.rb', line 51

# Shortcut for multi_get_or_post with the 'post_content' client method.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever multi_get_or_post returns
def multi_post(*args)
  multi_get_or_post('post_content', *args)
end

#post(*args) ⇒ Object



48
# File 'lib/scrapey/scrapey.rb', line 48

# Shortcut for get_or_post with the 'post' method.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever get_or_post returns
def post(*args)
  get_or_post('post', *args)
end

#save(item, output = nil) ⇒ Object



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/scrapey/scrapey.rb', line 77

# Appends one item as a row to a CSV output.
#
# The CSV for each output name is opened once, in 'w' mode, and remembered in
# @csvs together with its column list; ".csv" is appended to names that do not
# already end in "csv". The header row is written when the file is opened.
# Columns come from @fields when writing to the default @output and @fields
# is non-empty; otherwise from the keys of the first item saved.
#
# If the file cannot be opened because of Errno::EACCES a message is printed
# and the process exits; any other error propagates.
#
# @param item [Hash] the record to write
# @param output [String, nil] output name; defaults to @output
# @return [CSV] the CSV the row was appended to
def save(item, output = nil)
  output ||= @output
  @csvs ||= {}
  @csvs[output] ||= begin
    entry = {}
    begin
      fn = output.gsub(/(?<!csv)$/, '.csv')
      entry[:csv] = CSV.open(fn, 'w')
    rescue Errno::EACCES
      puts "Unable to access #{fn} - is it locked?"
      exit
    end
    use_declared_fields = output == @output && @fields && !@fields.empty?
    entry[:fields] = use_declared_fields ? @fields : item.keys
    entry[:csv] << entry[:fields]
    entry
  end
  target = @csvs[output]
  target[:csv] << target[:fields].map { |field| item[field] }
end

#save_cache(url, body, options = {}) ⇒ Object



41
42
43
# File 'lib/scrapey/cache/disk.rb', line 41

# Writes a document to the disk cache as a zlib-deflated Marshal dump.
#
# @param url [String] the URL used as cache key
# @param doc [Object] the document to serialize
# @param options [Hash] accepted but not used by this method
# @return [File] the (closed) cache file
def save_cache(url, doc, options = {})
  File.open(cache_filename(url), "wb") do |file|
    serialized = Marshal.dump(doc)
    file << Zlib::Deflate.deflate(serialized)
  end
end

#save_images(urls) ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/scrapey/scrapey.rb', line 61

# Downloads images into the images folder under BASEDIR.
#
# The file name is the part of the URL after its last slash. Files that
# already exist are not downloaded again.
#
# Uses Dir.exist? / File.exist?; the *exists? aliases were removed in Ruby 3.2.
# A URL without a usable file name raises instead of hitting the leftover
# binding.pry breakpoint.
#
# @param urls [Enumerable<String>] image URLs
# @return [Array<String>] the file names, in the order of urls
# @raise [ArgumentError] when a URL has nothing after its last slash
def save_images(urls)
  folder = "#{BASEDIR}/images"
  Dir.mkdir(folder) unless Dir.exist?(folder)
  names = []
  urls.each do |url|
    name = url[/[^\/]+$/]
    raise ArgumentError, "Cannot derive a file name from #{url.inspect}" unless name
    names << name
    fn = "#{folder}/#{name}"
    next if File.exist?(fn)
    file = @agent.get(url)
    File.open(fn, 'wb') { |f| f << file.body }
  end
  names
end

#set_proxy(*args) ⇒ Object



53
54
55
# File 'lib/scrapey/scrapey.rb', line 53

# Forwards proxy settings to @agent.
#
# @param args [Array] passed unchanged to @agent.set_proxy
# @return [Object] whatever @agent.set_proxy returns
def set_proxy(*args)
  @agent.set_proxy(*args)
end

#tables(*args) ⇒ Object



6
7
8
9
10
11
12
13
14
15
# File 'lib/scrapey/database.rb', line 6

# Defines an ActiveRecord model class for each given name.
#
# Each name becomes a top-level constant holding a new subclass of
# ActiveRecord::Base. If any model's table does not exist and
# src/schema.rb is present under BASEDIR, that schema file is required.
#
# Uses File.exist?; the File.exists? alias was removed in Ruby 3.2.
#
# @param args [Array<String, Symbol>] model class names
# @return [Boolean, nil] the result of requiring the schema, or nil when it is not loaded
# @raise [RuntimeError] via check_db_config when no database is configured
def tables(*args)
  check_db_config
  missing_tables = false
  args.each do |arg|
    model = Object.const_set(arg, Class.new(ActiveRecord::Base))
    missing_tables = true unless model.table_exists?
  end
  schema = "#{BASEDIR}/src/schema.rb"
  require schema if missing_tables && File.exist?(schema)
end

#truncate(*args) ⇒ Object



17
18
19
20
21
22
# File 'lib/scrapey/database.rb', line 17

# Empties the table behind each named model.
#
# Every name is resolved to a top-level constant, and a TRUNCATE TABLE
# statement for its table_name is executed on the ActiveRecord connection.
#
# @param args [Array<String, Symbol>] model class names
# @return [Array] the given names
# @raise [RuntimeError] via check_db_config when no database is configured
def truncate(*args)
  check_db_config
  args.each do |model_name|
    connection = ActiveRecord::Base.connection
    table = Object.const_get(model_name).table_name
    connection.execute("TRUNCATE TABLE #{table}")
  end
end

#ts ⇒ Object



112
113
114
# File 'lib/scrapey/scrapey.rb', line 112

# Current time as a string of whole seconds since the Unix epoch.
#
# @return [String]
def ts
  "#{Time.now.to_i}"
end

#use_cache(options = {}) ⇒ Object



3
4
5
6
7
8
9
10
11
12
# File 'lib/scrapey/cache.rb', line 3

# Turns caching on and loads the matching cache backend.
#
# When options contains :redis, that value is stored in @redis and the redis
# backend is required. Otherwise the disk backend is required,
# @config['cache_dir'] defaults to the cache folder under BASEDIR, and that
# directory is created.
#
# Note that :redis is removed from the options hash that was passed in.
#
# @param options [Hash] may carry :redis
# @return [Object] the result of the require (redis) or of FileUtils.mkdir_p (disk)
def use_cache(options = {})
  @use_cache = true
  @redis = options.delete(:redis)
  return require('scrapey/cache/redis') if @redis

  require 'scrapey/cache/disk'
  @config['cache_dir'] ||= "#{BASEDIR}/cache"
  FileUtils.mkdir_p @config['cache_dir']
end

#use_tor ⇒ Object



4
5
6
# File 'lib/scrapey/tor.rb', line 4

# Routes the agent through a local HTTP proxy in front of Tor.
#
# The defaults keep the previous hard-coded behaviour (localhost:8118); pass
# other values when the proxy listens elsewhere.
#
# @param host [String] proxy host
# @param port [Integer] proxy port
# @return [Object] whatever set_proxy returns
def use_tor(host = 'localhost', port = 8118)
  set_proxy(host, port)
end

#visit(*args) ⇒ Object



51
# File 'lib/scrapey/scrapey.rb', line 51

# Shortcut for get_or_post with the 'visit' method.
#
# @param args [Array] forwarded unchanged after the method name
# @return [Object] whatever get_or_post returns
def visit(*args)
  get_or_post('visit', *args)
end

#visited?(url) ⇒ Boolean

Returns:

  • (Boolean)


101
102
103
104
105
106
# File 'lib/scrapey/scrapey.rb', line 101

# Reports whether a URL was seen before, recording it when it was not.
#
# Seen URLs are kept in the @visited array.
#
# @param url [Object] the URL to check
# @return [Boolean] true when the URL had already been recorded
def visited?(url)
  @visited ||= []
  already_seen = @visited.include?(url)
  @visited << url unless already_seen
  already_seen
end

#with_cache(cassette_name = 'my_cassette') ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/scrapey/cache.rb', line 25

# Runs the block inside a VCR cassette.
#
# Requires vcr and fakeweb, configures VCR to keep cassettes in the cache
# folder under BASEDIR, to hook into fakeweb and to allow HTTP connections
# when no cassette is in use, then yields inside the named cassette.
#
# @param cassette_name [String] name of the cassette
# @yield the requests to perform inside the cassette
# @return [Object] whatever VCR.use_cassette returns
def with_cache(cassette_name = 'my_cassette')
  require 'vcr'
  require 'fakeweb'

  VCR.configure do |vcr_config|
    vcr_config.cassette_library_dir = "#{BASEDIR}/cache"
    vcr_config.hook_into :fakeweb
    vcr_config.allow_http_connections_when_no_cassette = true
  end

  VCR.use_cassette(cassette_name, record: :new_episodes, match_requests_on: [:method, :uri, :body]) { yield }
end

#without_cache ⇒ Object



21
22
23
# File 'lib/scrapey/cache.rb', line 21

# Runs the given block and returns its value.
#
# NOTE(review): this method does not touch @use_cache; it only yields.
#
# @yield the work to perform
# @return [Object] the value of the block
def without_cache
  block_result = yield
  block_result
end