Class: Robots

Inherits:
Object
  • Object
show all
Defined in:
lib/robots.rb

Overview

Robots retrieves and processes the robots.txt file from the target server

Instance Method Summary collapse

Constructor Details

#initialize(options) ⇒ Robots

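Fetches the robots.txt file from the server named in options[:url], parses it, and selects the rules for the configured user agent (falling back to the wildcard "*" entry).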



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/robots.rb', line 5

# Fetches the robots file from the server named in +options[:url]+, parses it
# with +parse_data+ and selects the rule set for the configured user agent,
# falling back to the wildcard (+:*+) entry when the agent has no entry.
#
# The caller's hash is copied, so defaults are never written back into it.
#
# @param options [Hash] settings for this instance
# @option options [String] :url required; only its scheme, host and port are
#   used to build the robots file URL
# @option options [String] :file ("robots.txt") file name requested from the
#   server root
# @option options [#to_s] :user_agent ("cobweb") agent name; it is downcased
#   and converted to a symbol before being looked up in the parsed data
# @raise [RuntimeError] if +options+ is not a Hash, +:url+ is missing, the
#   response mime type does not start with "text/", or neither the user agent
#   nor the wildcard entry is present in the parsed data
def initialize(options)
  raise "options should be a hash" unless options.kind_of? Hash
  raise ":url is required" unless options.has_key? :url

  # Work on a copy: filling in defaults must not modify (or fail on a frozen)
  # hash owned by the caller.
  @options = options.dup
  @options[:file] = "robots.txt" unless @options.has_key? :file
  @options[:user_agent] = "cobweb" unless @options.has_key? :user_agent

  uri = URI.parse(@options[:url])
  robots_url = "#{uri.scheme}://#{uri.host}:#{uri.port}/#{@options[:file]}"
  content = Cobweb.new(:cache => nil, :text_mime_types => ["text/html", "application/xhtml+xml", "text/plain"]).get(robots_url)

  # to_s makes a response without any mime type fail with the same error as a
  # non-text one instead of a NoMethodError on nil.
  unless content[:mime_type].to_s.start_with?("text/")
    # Report :content_type when the response carries it, otherwise the
    # :mime_type value that was actually checked above.
    raise "Invalid mime type: #{content[:content_type] || content[:mime_type]}"
  end

  @raw_data = parse_data(content[:body])

  agent_key = @options[:user_agent].to_s.downcase.to_sym
  if @raw_data.has_key?(agent_key)
    @params = @raw_data[agent_key]
  else
    raise "Wildcard user-agent is not present" unless @raw_data.has_key? :*
    @params = @raw_data[:*]
  end
end

Instance Method Details

#allowed?(url) ⇒ Boolean

Returns:

  • (Boolean)


28
29
30
31
32
33
34
35
36
37
# File 'lib/robots.rb', line 28

# Decides whether the path of +url+ is permitted by the selected rule set.
#
# Allow patterns are consulted first: a match on any of them permits the URL.
# Otherwise a match on any disallow pattern forbids it. A path that matches
# no pattern at all is permitted.
#
# @param url [String] URL to check; only its path component is matched
# @return [Boolean]
def allowed?(url)
  path = URI.parse(url).path
  hit = lambda do |pattern|
    path.match(Cobweb.escape_pattern_for_regex(pattern, @options))
  end

  return true if @params[:allow].any? { |pattern| hit.call(pattern) }

  @params[:disallow].none? { |pattern| hit.call(pattern) }
end

#contents ⇒ Object



43
44
45
# File 'lib/robots.rb', line 43

# Returns the parsed robots file data held in @raw_data, i.e. the value
# parse_data produced for the fetched body in the constructor.
#
# NOTE(review): the constructor looks entries up by downcased user-agent
# symbols (and :*), so this is presumably a Hash keyed that way; confirm
# against parse_data.
#
# @return [Object] the parsed data for all user agents
def contents
  @raw_data
end

#user_agent_settings ⇒ Object



39
40
41
# File 'lib/robots.rb', line 39

# Returns the rule set held in @params: the entry of the parsed data that the
# constructor selected for the configured user agent, or the wildcard (:*)
# entry when the agent has none. allowed? reads its :allow and :disallow
# entries.
#
# @return [Object] the rules applying to this instance's user agent
def user_agent_settings
  @params
end