Class: Robotex

Inherits:
Object
  • Object
show all
Defined in:
lib/robotex.rb

Defined Under Namespace

Classes: ParsedRobots

Constant Summary collapse

VERSION =
'1.0.0'
DEFAULT_TIMEOUT =
3

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(user_agent = nil) ⇒ Robotex

Returns a new instance of Robotex.



118
119
120
121
122
123
# File 'lib/robotex.rb', line 118

# Sets up a new Robotex instance.
#
# @param user_agent [String, nil] User-Agent string to identify as; when nil,
#   a default built from VERSION and the project URL is used
def initialize(user_agent = nil)
  # Cache of parsed robots.txt data, filled lazily by parse_host.
  @parsed = {}
  # Start one second after the epoch so the first access counts as long ago.
  @last_accessed = Time.at(1)
  # Only nil triggers the default; any other value is stored unchanged.
  @user_agent =
    if user_agent.nil?
      "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)"
    else
      user_agent
    end
end

Instance Attribute Details

#user_agent ⇒ Object (readonly)

Returns the value of attribute user_agent.



14
15
16
# File 'lib/robotex.rb', line 14

# Read-only accessor for the User-Agent string stored by the constructor.
#
# @return [Object] the value of @user_agent
def user_agent
  @user_agent
end

Class Method Details

.get_robots_txt(uri, user_agent) ⇒ Object



100
101
102
103
104
105
106
107
108
# File 'lib/robotex.rb', line 100

# Requests "/robots.txt" from the site that +uri+ points at.
#
# The whole request is bounded by Robotex.timeout seconds. The fetch is
# best-effort: any StandardError raised while building or opening the URL is
# swallowed and reported as nil.
#
# @param uri [#to_s] a URL on the target site; used as the base for "/robots.txt"
# @param user_agent [String] value sent as the User-Agent request header
# @return [Object, nil] whatever opening the URL returns, or nil when the
#   request failed or the timeout expired
def self.get_robots_txt(uri, user_agent)
  Timeout.timeout(Robotex.timeout) do
    begin
      URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent)
    rescue StandardError
      nil
    end
  end
rescue Timeout::Error
  STDERR.puts "robots.txt request timed out"
  nil
end

.timeout ⇒ Object



114
115
116
# File 'lib/robotex.rb', line 114

# Timeout applied to robots.txt requests.
#
# @return [Object] the value assigned through timeout=, or DEFAULT_TIMEOUT
#   when none (or a falsy value) has been set
def self.timeout
  return DEFAULT_TIMEOUT unless @timeout

  @timeout
end

.timeout=(t) ⇒ Object



110
111
112
# File 'lib/robotex.rb', line 110

# Overrides the class-level timeout returned by Robotex.timeout.
#
# @param t [Object] new timeout value; nil or false makes Robotex.timeout
#   fall back to DEFAULT_TIMEOUT
def self.timeout=(t)
  @timeout = t
end

Instance Method Details

#allowed?(uri) ⇒ Boolean

Download the server’s robots.txt, and return true if we are allowed to access the URL, false otherwise

Returns:

  • (Boolean)


133
134
135
# File 'lib/robotex.rb', line 133

# Asks the robots.txt rules for the host of +uri+ whether this instance's
# user agent may access +uri+.
#
# @param uri [URI, #to_s] the URL to check
# @return [Boolean] the answer given by the host's rules object
def allowed?(uri)
  host_rules = parse_host(uri)
  host_rules.allowed?(uri, @user_agent)
end

#delay(uri) ⇒ Object

Return the value of the Crawl-Delay directive, or nil if none



139
140
141
# File 'lib/robotex.rb', line 139

# Looks up the Crawl-Delay that the host of +uri+ declares for this
# instance's user agent.
#
# @param uri [URI, #to_s] any URL on the host of interest
# @return [Object, nil] the value reported by the host's rules object, or nil
#   if none
def delay(uri)
  host_rules = parse_host(uri)
  host_rules.delay(@user_agent)
end

#delay!(uri) ⇒ Object

Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server



146
147
148
149
150
# File 'lib/robotex.rb', line 146

# Sleeps for as long as is still needed to honour the Crawl-Delay of the host
# of +uri+, then records the current time as the last access.
#
# @param uri [URI, #to_s] any URL on the host about to be accessed
# @return [Time] the newly recorded last-access time
def delay!(uri)
  crawl_delay = delay(uri)
  if crawl_delay
    remaining = crawl_delay - (Time.now - @last_accessed)
    # Kernel#sleep raises ArgumentError for negative intervals, which is what
    # happens once more than crawl_delay seconds have passed (always the case
    # on the first call, since @last_accessed starts near the epoch).
    sleep remaining if remaining > 0
  end
  @last_accessed = Time.now
end

#parse_host(uri) ⇒ Object



125
126
127
128
# File 'lib/robotex.rb', line 125

# Returns the ParsedRobots for the site that +uri+ belongs to, creating and
# caching it on first use.
#
# robots.txt applies per scheme + host + port, so the cache is keyed on all
# three; keying on the host alone would make e.g. http://example.com and
# http://example.com:8080 share one ruleset. Scheme and host are downcased
# because both are case-insensitive.
#
# @param uri [URI, #to_s] a URL on the site of interest
# @return [ParsedRobots] the cached rules object for that site
def parse_host(uri)
  uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
  origin = [uri.scheme.to_s.downcase, uri.host.to_s.downcase, uri.port]
  @parsed[origin] ||= ParsedRobots.new(uri, @user_agent)
end