Class: Scrape::RobotsTxt
- Inherits: Object
- Inheritance hierarchy: Object → Scrape::RobotsTxt
- Defined in:
- lib/scrape/robots_txt.rb
Class Method Summary collapse
- .load(url, default = true) ⇒ Object
- .parse(content) ⇒ Object
Instance Method Summary collapse
- #=~(str) ⇒ Object
- #[](user_agent) ⇒ Object
- #disallows ⇒ Object
- #each(&block) ⇒ Object
- #initialize(rules) ⇒ RobotsTxt (constructor): Returns a new instance of RobotsTxt.
- #user_agents ⇒ Object
Constructor Details
#initialize(rules) ⇒ RobotsTxt
Returns a new instance of RobotsTxt.
4 5 6 7 |
# File 'lib/scrape/robots_txt.rb', line 4
# Creates a RobotsTxt around a table of per-user-agent rules.
#
# @param rules [Hash] rule sets keyed by user-agent string
def initialize(rules)
  # Work on a shallow copy: installing the fallback below would otherwise
  # change the default of the hash the caller still holds.
  @rules = rules.dup
  # Looking up a user agent that has no entry yields this
  # Scrape::RobotsTxtRules instance instead of nil.
  @rules.default = Scrape::RobotsTxtRules.new
end
Class Method Details
.load(url, default = true) ⇒ Object
50 51 52 53 54 55 56 |
# File 'lib/scrape/robots_txt.rb', line 50
# Fetches a robots.txt document through Scrape.open and hands it to parse.
#
# @param url location to fetch; when +default+ is truthy it is first joined
#   with "/robots.txt" via Addressable::URI.join
# @param default [Boolean] whether to rewrite +url+ to its "/robots.txt"
# @return the result of parse, or nil when Scrape::HTTPError is raised
def self.load(url, default = true)
  begin
    url = Addressable::URI.join(url, "/robots.txt") if default
    parse(Scrape.open(url))
  rescue Scrape::HTTPError
    # A failed fetch is logged and reported as nil rather than re-raised.
    Scrape.logger.warn("Failed to obtain robots.txt: #{url}")
    nil
  end
end
.parse(content) ⇒ Object
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# File 'lib/scrape/robots_txt.rb', line 31
# Builds a RobotsTxt from the text of a robots.txt file.
#
# Only "User-agent" and "Disallow" records are collected. Directive names are
# matched case-insensitively, "#" comments (whole-line or trailing) are
# dropped, and consecutive User-agent lines form one group whose Disallow
# rules apply to every agent in it. Disallow lines that appear before any
# User-agent line, and Disallow lines with an empty value, add no rule.
#
# @param content [String, nil] raw robots.txt text
# @return [RobotsTxt, nil] nil when +content+ is nil
def self.parse(content)
  return if content.nil?

  rules = {}
  group = []                # user agents the next rules apply to
  collecting_agents = false # true while reading a run of User-agent lines

  content.each_line do |raw|
    line = raw.sub(/#.*/, '').strip

    case line
    when /\Auser-agent\s*:\s*(.+)\z/i
      # A User-agent line that follows a rule opens a new group.
      group = [] unless collecting_agents
      collecting_agents = true
      user_agent = Regexp.last_match(1).strip
      # Keep rules already gathered for this agent by an earlier group.
      rules[user_agent] ||= Scrape::RobotsTxtRules.new
      group << user_agent unless group.include?(user_agent)
    when /\Adisallow\s*:\s*(.*)\z/i
      collecting_agents = false
      path = Regexp.last_match(1).strip
      next if path.empty?

      group.each { |agent| rules[agent] << path }
    when /\Aallow\s*:/i
      # Allow rules are not recorded, but they still end the User-agent run.
      collecting_agents = false
    end
  end

  new rules
end
Instance Method Details
#=~(str) ⇒ Object
23 24 25 |
# File 'lib/scrape/robots_txt.rb', line 23
# Matches +str+ against the rules that apply to Scrape.user_agent.
#
# @param str value passed on to the rule set's own =~
# @return whatever the rule set's =~ returns
def =~(str)
  applicable = self[Scrape.user_agent]
  applicable =~ str
end
#[](user_agent) ⇒ Object
17 18 19 20 21 |
# File 'lib/scrape/robots_txt.rb', line 17
# Returns the rules for +user_agent+ combined with the rules stored under '*'.
#
# @param user_agent [String] user-agent key to look up
# @return a clone of the '*' rules when +user_agent+ is '*'; otherwise the
#   agent's own rules followed by the '*' rules
def [](user_agent)
  own = @rules[user_agent].clone
  return own if user_agent == '*'

  own + @rules['*']
end
#disallows ⇒ Object
13 14 15 |
# File 'lib/scrape/robots_txt.rb', line 13
# Collects the stored rules of every user agent into one flattened array.
#
# @return [Array]
def disallows
  per_agent = @rules.values
  per_agent.flatten
end
#each(&block) ⇒ Object
27 28 29 |
# File 'lib/scrape/robots_txt.rb', line 27
# Iterates over the stored rules table by forwarding +block+ to its #each.
#
# @return whatever the rules table's #each returns
def each(&block)
  @rules.each(&block)
end
#user_agents ⇒ Object
9 10 11 |
# File 'lib/scrape/robots_txt.rb', line 9
# Lists the user-agent keys that have an entry in the rules table.
#
# @return [Array]
def user_agents
  @rules.each_key.to_a
end