Class: RWGet::Controller

Inherits:
Object
  • Object
show all
Defined in:
lib/rwget/controller.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options) ⇒ Controller

Returns a new instance of Controller.



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/rwget/controller.rb', line 11

# Builds a controller around +options+.
#
# The hash is kept by reference (not copied) and normalised in place:
# * :user_agent, :accept_patterns and :reject_patterns get defaults when unset;
# * :quota, :depth, :wait, :limit_rate and :time_limit are coerced with #to_i,
#   so an entry that was nil ends up as 0.
#
# Each collaborator (queue, fetch, store, links, dupes) is then instantiated
# with the same options hash. The class used is the one named by the matching
# :<name>_class option, resolved via Controller.resolve_class, or the
# RWGet default when that option is unset.
#
# @param options [Hash] crawl settings keyed by Symbol; mutated by this call
def initialize(options)
  @options = options

  @options[:user_agent]      ||= "Ruby/Wget"
  @options[:accept_patterns] ||= []
  @options[:reject_patterns] ||= []

  [:quota, :depth, :wait, :limit_rate, :time_limit].each do |setting|
    @options[setting] = @options[setting].to_i
  end

  # The default is wrapped in a lambda so its constant is only looked up
  # when no override was supplied.
  build = lambda do |option_name, default_class|
    override = options[option_name]
    chosen = override ? self.class.resolve_class(override) : default_class.call
    chosen.new(options)
  end

  @queue = build.call(:queue_class, -> { RWGet::Queue })
  @fetch = build.call(:fetch_class, -> { RWGet::Fetch })
  @store = build.call(:store_class, -> { RWGet::Store })
  @links = build.call(:links_class, -> { RWGet::Links })
  @dupes = build.call(:dupes_class, -> { RWGet::Dupes })
end

Instance Attribute Details

#options ⇒ Object (readonly)

Returns the value of attribute options.



3
4
5
# File 'lib/rwget/controller.rb', line 3

# Reader for the controller's options.
#
# @return [Object] whatever is currently held in @options; the object itself
#   is returned, not a copy, so changes made through it are visible here
def options
  @options
end

Class Method Details

.resolve_class(string) ⇒ Object



5
6
7
8
9
# File 'lib/rwget/controller.rb', line 5

# Resolves a constant from its (optionally namespaced) name.
#
# The name is split on "::" and each segment is looked up inside the
# previous one, starting from Object. Empty segments are ignored, so an
# absolute name such as "::Foo::Bar" resolves the same way as "Foo::Bar".
# A value that is already a Class or Module is returned unchanged.
#
# @param string [String, Module] constant name, e.g. "RWGet::Queue"
# @return [Object] the value of the constant (normally a Class or Module)
# @raise [NameError] if a segment does not name a constant
def self.resolve_class(string)
  return string if string.is_a?(Module)

  # reject(&:empty?) drops the leading "" produced by a "::"-prefixed name,
  # which would otherwise fail on const_get("").
  string.split("::").reject(&:empty?).inject(Object) do |scope, name|
    scope.const_get(name)
  end
end

Instance Method Details

#close ⇒ Object



116
117
118
119
120
# File 'lib/rwget/controller.rb', line 116

# Closes the controller's collaborators.
#
# Visits @queue, @fetch, @store, @links and @dupes in that order and calls
# +close+ on every one that publicly responds to it; the others are skipped.
#
# @return [Array] the five collaborators that were visited
def close
  collaborators = [@queue, @fetch, @store, @links, @dupes]
  collaborators.each do |collaborator|
    next unless collaborator.respond_to?(:close)

    collaborator.close
  end
end

#key_for(uri) ⇒ Object



104
105
106
107
108
109
110
111
112
113
114
# File 'lib/rwget/controller.rb', line 104

# Builds the storage key (a "/"-joined relative path) for +uri+.
#
# Leading components, in this order, each governed by an option:
# * options[:prefix]   - included when set
# * @start_time        - included when options[:timestampize]
# * uri.scheme         - included when options[:protocol_directories]
# * uri.host           - included unless options[:no_host_directories]
#
# These are followed by the segments of uri.path. When the URI has a query
# string it is attached to the last path segment after a "?"; for an empty
# or root path the query becomes a segment of its own ("?a=1").
#
# @param uri [#scheme, #host, #path, #query] the location being stored,
#   e.g. a parsed URI
# @return [String] the key, as produced by File.join
def key_for(uri)
  leading = []
  leading << options[:prefix] if options[:prefix]
  leading << @start_time      if options[:timestampize]
  leading << uri.scheme       if options[:protocol_directories]
  leading << uri.host         unless options[:no_host_directories]

  segments = uri.path.split("/")
  # For "" or "/" the split yields no segments and #pop returns nil;
  # interpolation turns that into "" instead of raising on nil + "?".
  segments << "#{segments.pop}?#{uri.query}" if uri.query
  # An absolute path splits into a leading "" segment; drop it.
  segments.shift if segments.first.to_s.empty?

  File.join(leading + segments)
end

#legal?(link) ⇒ Boolean

Returns:

  • (Boolean)


89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/rwget/controller.rb', line 89

# Decides whether +link+ may be added to the crawl, printing the reason for
# each decision along the way.
#
# A link is refused outright when options[:span_hosts] is unset and its host
# is not in @original_hosts. Otherwise its string form must match at least
# one of options[:accept_patterns] (an empty list accepts everything) and
# none of options[:reject_patterns].
#
# @param link [#host, #to_s] candidate link, e.g. a parsed URI
# @return [Boolean] true when the link passes every check
def legal?(link)
  host_allowed = options[:span_hosts] || @original_hosts.include?(link.host)
  if !host_allowed
    puts "can't span hosts: #{link}"
    return false
  end

  url = link.to_s
  accept_patterns = options[:accept_patterns]

  accepted = accept_patterns.empty?
  if accepted
    puts "accepted by default: #{url}"
  else
    accepted = accept_patterns.any? { |pattern| url =~ pattern }
  end
  puts "not in accept patterns: #{url}" unless accepted

  rejected = options[:reject_patterns].any? { |pattern| url =~ pattern }
  puts "in reject patterns: #{url}" if rejected

  accepted && !rejected
end

#start ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/rwget/controller.rb', line 30

# Runs the crawl loop.
#
# Every entry of options[:seeds] is queued at depth 0 and its host recorded
# in @original_hosts. The loop then pulls (url, depth) pairs from the queue
# until one of the following happens:
# * the queue returns no url                         -> prints "no more urls"
# * options[:quota] (bytes, 0 = unlimited) is reached -> prints "hit time/quota"
# * options[:time_limit] (seconds, 0 = unlimited) has elapsed -> same message
#
# For each url:
# * it is skipped when options[:depth] > 0 and its depth exceeds that value;
# * the loop sleeps in 1-second steps while the average download rate since
#   the start exceeds options[:limit_rate] (0 = unlimited);
# * it is fetched with options[:user_agent]; a falsy tempfile means failure;
# * every link found in the download is checked with #legal? and with
#   @dupes.dupe? (both are always called) and queued at depth + 1 when it is
#   legal and not a dupe;
# * the download is stored under key_for(uri), i.e. keyed by the requested
#   uri rather than the effective url returned by the fetcher;
# * the loop sleeps options[:wait] seconds.
#
# The tempfile returned by the fetcher is closed even when link extraction,
# key building or storing raises; such an error still propagates to the caller.
#
# @return [nil]
def start
  @start_time = Time.now.to_i.to_s
  @start = Time.now
  @original_hosts = Set.new
  options[:seeds].each do |seed|
    @queue.put(seed, 0)
    @original_hosts << URI.parse(seed).host
  end

  downloaded = 0
  while (options[:quota] == 0 || downloaded < options[:quota]) &&
        (options[:time_limit] == 0 || Time.now - @start < options[:time_limit])

    url, depth = @queue.get

    unless url
      puts "no more urls"
      return
    end

    next if options[:depth] > 0 && depth > options[:depth]

    uri = URI.parse(url)

    while options[:limit_rate] > 0 && downloaded / (Time.now - @start) > options[:limit_rate]
      puts "sleeping until under rate limit"
      sleep 1
    end
    # NOTE: the figure is bytes per second (sizes come from File.size); the
    # "bps" label is kept as-is so existing log output does not change.
    puts "download rate: #{downloaded / (Time.now - @start)}bps"

    puts "downloading #{uri}"
    effective_url, tmpfile = @fetch.fetch(uri, options[:user_agent])

    unless tmpfile
      puts "unable to download"
      next
    end

    begin
      downloaded += File.size(tmpfile.path)
      puts "parsing links"
      @links.urls(effective_url, tmpfile).each do |link|
        legal = legal?(link)
        dupe = @dupes.dupe?(link)
        puts "dupe: #{link}" if dupe
        if legal && !dupe
          puts "adding link: #{link}"
          @queue.put(link, depth + 1)
        end
      end
      key = key_for(uri)
      puts "storing at #{key}"
      @store.put(key, tmpfile)
      sleep options[:wait]
    ensure
      # Best-effort close: a failure here must not mask the real outcome.
      begin
        tmpfile.close
      rescue StandardError
        nil
      end
    end
  end
  puts "hit time/quota"
end