Class: Scrape::RobotsTxt

Inherits:
Object
Defined in:
lib/scrape/robots_txt.rb

Class Method Summary

  .load(url, default = true) ⇒ Object
  .parse(content) ⇒ Object

Instance Method Summary

  #=~(str) ⇒ Object
  #[](user_agent) ⇒ Object
  #disallows ⇒ Object
  #each(&block) ⇒ Object
  #user_agents ⇒ Object

Constructor Details

#initialize(rules) ⇒ RobotsTxt

Returns a new instance of RobotsTxt.



# File 'lib/scrape/robots_txt.rb', line 4

def initialize rules
  @rules = rules
  @rules.default = Scrape::RobotsTxtRules.new
end
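The constructor expects a Hash mapping user-agent strings to Scrape::RobotsTxtRules objects; unknown agents fall back to an empty default rule set. Instances are normally obtained via .parse or .load, but a minimal sketch of direct construction (the "/private" path is illustrative) could look like:

rules = Hash.new
rules["*"] = Scrape::RobotsTxtRules.new
rules["*"] << "/private"            # illustrative Disallow path
robots = Scrape::RobotsTxt.new(rules)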

Class Method Details

.load(url, default = true) ⇒ Object



# File 'lib/scrape/robots_txt.rb', line 50

def self.load url, default = true
  url = Addressable::URI.join(url, "/robots.txt") if default
  parse Scrape.open(url)
rescue Scrape::HTTPError
  Scrape.logger.warn "Failed to obtain robots.txt: #{url}"
  nil
end
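.load fetches and parses the robots.txt for a site. When default is true, "/robots.txt" is joined onto the given URL, so any page URL on the host will do; if the request raises Scrape::HTTPError, a warning is logged and nil is returned. A usage sketch (example.com is illustrative):

robots = Scrape::RobotsTxt.load("http://example.com/some/page")
# fetches http://example.com/robots.txt; returns nil if the request fails

raw = Scrape::RobotsTxt.load("http://example.com/custom-robots.txt", false)
# with default = false the given URL is used as-is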

.parse(content) ⇒ Object



# File 'lib/scrape/robots_txt.rb', line 31

def self.parse content
  return if content.nil?
  rules, user_agent = Hash.new, nil

  content.split("\n").each do |line|
    case line
    when /^#/
      next
    when /User-agent:\s*(.+)/
      user_agent = $1.strip
      rules.update user_agent => Scrape::RobotsTxtRules.new
    when /Disallow:\s*(.+)/
      rules[user_agent] << $1.strip
    end
  end

  new rules
end
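.parse walks the content line by line: comment lines starting with "#" are skipped, each "User-agent:" line opens a new rule set, and each "Disallow:" line is appended to the rule set of the most recent user agent. A small sketch (the paths are illustrative):

content = "User-agent: *\nDisallow: /tmp/\nDisallow: /private\n"
robots  = Scrape::RobotsTxt.parse(content)
robots.user_agents             # => ["*"]

Scrape::RobotsTxt.parse(nil)   # => nil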

Instance Method Details

#=~(str) ⇒ Object



# File 'lib/scrape/robots_txt.rb', line 23

def =~ str
  self[Scrape.user_agent] =~ str
end
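#=~ matches a string against the rules that apply to the current Scrape.user_agent, which makes it convenient for checking whether a path may be crawled. The exact return value comes from Scrape::RobotsTxtRules#=~ (not shown here); the path below is illustrative:

if robots =~ "/private/page.html"
  # the current Scrape.user_agent is disallowed from this path
end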

#[](user_agent) ⇒ Object



# File 'lib/scrape/robots_txt.rb', line 17

def [] user_agent
  rules  = @rules[user_agent].clone
  rules += @rules['*'] unless user_agent == '*'
  rules
end
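#[] returns a copy of the rules recorded for the given user agent, merged with the wildcard "*" rules unless the agent itself is "*". The agent name below is illustrative:

agent_rules = robots["Googlebot"]
# combines any Googlebot-specific rules with the "*" rules;
# robots["*"] returns the wildcard rules alone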

#disallows ⇒ Object



# File 'lib/scrape/robots_txt.rb', line 13

def disallows
  @rules.values.flatten
end
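#disallows collects the rule sets of every user agent into a single flattened list, assuming Scrape::RobotsTxtRules flattens like an Array:

robots.disallows
# => every Disallow entry recorded across all user agents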

#each(&block) ⇒ Object



# File 'lib/scrape/robots_txt.rb', line 27

def each &block
  @rules.each &block
end
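#each yields each user agent together with its rule set, so the object can be walked like the underlying Hash:

robots.each do |agent, rules|
  puts "#{agent}: #{rules.inspect}"
end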

#user_agents ⇒ Object



# File 'lib/scrape/robots_txt.rb', line 9

def user_agents
  @rules.keys
end
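#user_agents lists every agent that appeared in the parsed file (the names below are illustrative):

robots.user_agents   # => ["*", "Googlebot"]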