Class: Probot

Inherits:
Object
  • Object
show all
Defined in:
lib/probot.rb,
lib/probot/version.rb

Overview

Two main parts of this class:

Parse a robots.txt file
Find the most specific rule for a given URL. We use the length of the regexp as a proxy for specificity.

Defined Under Namespace

Classes: ParsedLine

Constant Summary collapse

VERSION =
"0.2.0"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(data, agent: "*") ⇒ Probot

Returns a new instance of Probot.

Raises:

  • (ArgumentError)


25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/probot.rb', line 25

# Build a Probot instance.
#
# data  - either a robots.txt document as a String, or an http(s) URL
#         (detected via a "http" prefix) whose robots.txt will be fetched.
# agent - the user-agent whose rules take precedence (defaults to "*").
#
# Raises ArgumentError when data is not a String.
def initialize(data, agent: "*")
  raise ArgumentError, "The first argument must be a string" unless data.is_a?(String)

  @agent = agent
  @rules = {}
  @current_agents = ["*"]
  @current_agents.each { |ua| @rules[ua] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
  @sitemaps = []

  @site = data.start_with?("http") ? URI(data) : nil
  @doc = @site ? fetch_robots_txt(@site) : data
  parse(@doc)
end

Instance Attribute Details

#agentObject

Returns the value of attribute agent.



23
24
25
# File 'lib/probot.rb', line 23

# Reader for the user-agent string this instance matches rules against.
def agent = @agent

#docObject (readonly)

Returns the value of attribute doc.



22
23
24
# File 'lib/probot.rb', line 22

# Reader for the raw robots.txt document text (readonly).
def doc = @doc

#rulesObject (readonly)

Returns the value of attribute rules.



22
23
24
# File 'lib/probot.rb', line 22

# Reader for the per-agent rules hash (readonly).
def rules = @rules

#siteObject

Returns the value of attribute site.



23
24
25
# File 'lib/probot.rb', line 23

# Reader for the site URI (nil when initialized from a raw document).
def site = @site

#sitemapsObject

Returns the value of attribute sitemaps.



23
24
25
# File 'lib/probot.rb', line 23

# Reader for the collected sitemap URLs.
def sitemaps = @sitemaps

Class Method Details

.allowed?(url, agent: "*") ⇒ Boolean

Returns:

  • (Boolean)


149
# File 'lib/probot.rb', line 149

def self.allowed?(url, agent: "*") = Probot.new(url, agent: agent).allowed?(url)

Instance Method Details

#allowedObject



52
# File 'lib/probot.rb', line 52

def allowed = rules.dig(@agent, "allow") || rules.dig("*", "allow")

#allowed?(url) ⇒ Boolean

If a URL is not disallowed, it is allowed — so we check whether it is explicitly disallowed and, if not, it is allowed.

Returns:

  • (Boolean)


67
# File 'lib/probot.rb', line 67

def allowed?(url) = !disallowed?(url)

#allowed_best(url) ⇒ Object



62
# File 'lib/probot.rb', line 62

def allowed_best(url) = allowed_matches(url).max_by { |k, v| v }

#allowed_matches(url) ⇒ Object



56
# File 'lib/probot.rb', line 56

def allowed_matches(url) = allowed.select { |allowed_url| url.match?(allowed_url) }.to_h { |rule| [rule, pattern_length(rule)] }

#crawl_delayObject



46
# File 'lib/probot.rb', line 46

def crawl_delay = rules.dig(@agent, "crawl_delay")

#disallowedObject



50
# File 'lib/probot.rb', line 50

def disallowed = rules.dig(@agent, "disallow") || rules.dig("*", "disallow")

#disallowed?(url) ⇒ Boolean

Returns:

  • (Boolean)


69
# File 'lib/probot.rb', line 69

def disallowed?(url) = matching_rule(url)&.keys&.first == :disallow

#disallowed_best(url) ⇒ Object



60
# File 'lib/probot.rb', line 60

def disallowed_best(url) = disallowed_matches(url).max_by { |k, v| v }

#disallowed_matches(url) ⇒ Object



54
# File 'lib/probot.rb', line 54

def disallowed_matches(url) = disallowed.select { |disallowed_url| url.match?(disallowed_url) }.to_h { |rule| [rule, pattern_length(rule)] }

#fetch_robots_txt(url) ⇒ Object



40
41
42
43
44
# File 'lib/probot.rb', line 40

# GET /robots.txt from the given site URL, sending the configured
# request headers. Best-effort by design: any failure (network error,
# bad URI, etc.) yields an empty document rather than raising.
def fetch_robots_txt(url)
  robots_url = URI(url)
  robots_url.path = "/robots.txt"
  Net::HTTP.get(robots_url, request_headers)
rescue
  ""
end

#found_agentsObject



48
# File 'lib/probot.rb', line 48

def found_agents = rules.keys

#matches(url) ⇒ Object



58
# File 'lib/probot.rb', line 58

def matches(url) = {disallowed: disallowed_matches(url), allowed: allowed_matches(url)}

#matching_rule(url) ⇒ Object



64
# File 'lib/probot.rb', line 64

def matching_rule(url) = (disallowed_best(url)&.last.to_i > allowed_best(url)&.last.to_i) ? {disallow: disallowed_best(url)&.first} : {allow: allowed_best(url)&.first}

#parse(doc) ⇒ Object



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/probot.rb', line 71

# Parse a robots.txt document line by line, populating @rules, @sitemaps
# and any other directives encountered.
#
# Consecutive User-agent lines form a single record: every agent named in
# the run shares the allow/disallow/crawl-delay rules that follow.
def parse(doc)
  # Tracks whether the previous significant line was also a User-agent
  # line, so consecutive User-agent lines are grouped into one record.
  subsequent_agent = false

  doc.lines.each do |line|
    # Skip comments and anything that is not a "key: value" directive.
    next if line.start_with?("#") || !line.include?(":") || line.split(":").length < 2

    data = ParsedLine.new(line)

    if data.agent?
      if subsequent_agent
        @current_agents << data.value
      else
        @current_agents = [data.value]
        subsequent_agent = true
      end

      # Make sure every agent in the current record has an empty rule set.
      @current_agents.each { |agent| rules[agent] ||= {"disallow" => [], "allow" => [], "crawl_delay" => 0} }
      next
    end

    # All regexp metacharacters are escaped, then * and $ are unescaped as
    # they carry wildcard/end-anchor meaning in robots.txt patterns.
    if data.allow? || data.disallow?
      @current_agents.each { |agent| rules[agent][data.key] << Regexp.new(Regexp.escape(data.value).gsub('\*', ".*").gsub('\$', "$")) }

      # A non-agent directive ends the current run of User-agent lines;
      # the next User-agent starts a fresh record. (Google ignores crawl_delay.)
      subsequent_agent = false
      next
    end

    if data.crawl_delay?
      @current_agents.each { |agent| rules[agent][data.key] = data.value }
      next
    end

    # Store an absolute sitemap URL: relative values are joined against site.
    # NOTE(review): when site is nil and the sitemap value is relative,
    # URI.join receives only the relative part — confirm intended behavior.
    if data.sitemap?
      sitemap_uri = URI(data.value)
      sitemap_uri = sitemap_uri.host.nil? ? URI.join(*[site, sitemap_uri].compact) : sitemap_uri
      @sitemaps << sitemap_uri.to_s
      next
    end

    # Any other recognized "key: value" directive is stored verbatim per agent.
    @current_agents.each { |agent| rules[agent][data.key] = data.value }
  end
end

#pattern_length(regexp) ⇒ Object



118
# File 'lib/probot.rb', line 118

def pattern_length(regexp) = regexp.source.gsub(/(\\[\*\$\.])/, "*").length

#request_headersObject



38
# File 'lib/probot.rb', line 38

def request_headers = (agent == "*") ? {} : {"User-Agent" => @agent}