Module: CrawlerDetection
Defined in: lib/crawler_detection.rb
Constant Summary
- WAYBACK_MACHINE_URL = "archive.org"
Class Method Summary
- .allow_crawler?(user_agent) ⇒ Boolean
Given a user_agent that returns true from crawler?, should its request be allowed?
- .crawler?(user_agent, via_header = nil) ⇒ Boolean
Whether the user agent (or Via header) appears to belong to a crawler.
- .is_blocked_crawler?(user_agent) ⇒ Boolean
Whether the user agent is a crawler whose request should be blocked.
- .show_browser_update?(user_agent) ⇒ Boolean
Whether the browser-update notice should be shown for this user agent.
- .to_matcher(string, type: nil) ⇒ Object
Builds a case-insensitive regexp from a pipe-delimited list of user agent fragments.
Class Method Details
.allow_crawler?(user_agent) ⇒ Boolean
Given a user_agent that returns true from crawler?, should its request be allowed?
# File 'lib/crawler_detection.rb', line 66

def self.allow_crawler?(user_agent)
  if SiteSetting.allowed_crawler_user_agents.blank? &&
       SiteSetting.blocked_crawler_user_agents.blank?
    return true
  end

  @allowlisted_matchers ||= {}
  @blocklisted_matchers ||= {}

  if SiteSetting.allowed_crawler_user_agents.present?
    allowlisted =
      @allowlisted_matchers[SiteSetting.allowed_crawler_user_agents] ||= to_matcher(
        SiteSetting.allowed_crawler_user_agents,
      )
    !user_agent.nil? && user_agent.match?(allowlisted)
  else
    blocklisted =
      @blocklisted_matchers[SiteSetting.blocked_crawler_user_agents] ||= to_matcher(
        SiteSetting.blocked_crawler_user_agents,
      )
    user_agent.nil? || !user_agent.match?(blocklisted)
  end
end
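A minimal usage sketch, assuming a Discourse console where SiteSetting is loaded; the setting values below are hypothetical pipe-delimited examples, not defaults:

# Illustrative (hypothetical) setting values:
#   SiteSetting.allowed_crawler_user_agents = "Googlebot|bingbot"
#   SiteSetting.blocked_crawler_user_agents = ""

CrawlerDetection.allow_crawler?("Mozilla/5.0 (compatible; Googlebot/2.1)") # => true
CrawlerDetection.allow_crawler?("SomeRandomBot/1.0")                       # => false
CrawlerDetection.allow_crawler?(nil)                                       # => false (allowlist mode requires a matching UA)

Note the precedence: when an allowlist is configured it wins and the blocklist is never consulted; in blocklist mode, anything not matching the blocklist (including a nil user agent) is allowed.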
.crawler?(user_agent, via_header = nil) ⇒ Boolean
# File 'lib/crawler_detection.rb', line 17

def self.crawler?(user_agent, via_header = nil)
  if user_agent.nil? || user_agent&.include?(WAYBACK_MACHINE_URL) ||
       via_header&.include?(WAYBACK_MACHINE_URL)
    return true
  end

  # this is done to avoid regenerating regexes
  @non_crawler_matchers ||= {}
  @matchers ||= {}

  possibly_real =
    (
      @non_crawler_matchers[SiteSetting.non_crawler_user_agents] ||= to_matcher(
        SiteSetting.non_crawler_user_agents,
        type: :real,
      )
    )

  if user_agent.match?(possibly_real)
    known_bots =
      (@matchers[SiteSetting.crawler_user_agents] ||= to_matcher(SiteSetting.crawler_user_agents))
    if user_agent.match?(known_bots)
      bypass =
        (
          @matchers[SiteSetting.crawler_check_bypass_agents] ||= to_matcher(
            SiteSetting.crawler_check_bypass_agents,
          )
        )
      !user_agent.match?(bypass)
    else
      false
    end
  else
    true
  end
end
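A sketch of the decision flow, assuming typical Discourse settings (the exact contents of non_crawler_user_agents and crawler_user_agents may differ on a given site):

CrawlerDetection.crawler?(nil)
# => true  (a missing user agent is always treated as a crawler)

CrawlerDetection.crawler?("anything", "1.1 web.archive.org")
# => true  (the Wayback Machine is detected via the Via header)

CrawlerDetection.crawler?("Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")
# => true, assuming "googlebot" is in crawler_user_agents: the UA looks
#    "possibly real" but also matches a known bot pattern

CrawlerDetection.crawler?("curl/7.88.1")
# => true, assuming "curl" matches nothing in non_crawler_user_agents:
#    anything that does not look like a real browser is treated as a crawler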
.is_blocked_crawler?(user_agent) ⇒ Boolean
# File 'lib/crawler_detection.rb', line 90

def self.is_blocked_crawler?(user_agent)
  crawler?(user_agent) && !allow_crawler?(user_agent)
end
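This is the composition a request pipeline would use: classify first with crawler?, then consult the allow/block settings. A sketch under the same hypothetical allowlist as above (allowed_crawler_user_agents = "Googlebot"):

CrawlerDetection.is_blocked_crawler?("Googlebot/2.1")   # => false (a crawler, but allowlisted)
CrawlerDetection.is_blocked_crawler?("SomeScraper/1.0") # => true  (a crawler, not allowlisted)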
.show_browser_update?(user_agent) ⇒ Boolean
# File 'lib/crawler_detection.rb', line 54

def self.show_browser_update?(user_agent)
  return false if SiteSetting.browser_update_user_agents.blank?

  @browser_update_matchers ||= {}
  matcher =
    @browser_update_matchers[SiteSetting.browser_update_user_agents] ||= to_matcher(
      SiteSetting.browser_update_user_agents,
    )
  user_agent.match?(matcher)
end
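A sketch assuming a hypothetical browser_update_user_agents value of "MSIE|Trident" (the real default may differ):

CrawlerDetection.show_browser_update?("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1)")
# => true
CrawlerDetection.show_browser_update?("Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0")
# => false

As with the other checks, the compiled regexp is memoized keyed by the current setting value, so editing the site setting picks up a fresh matcher automatically.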
.to_matcher(string, type: nil) ⇒ Object
# File 'lib/crawler_detection.rb', line 6

def self.to_matcher(string, type: nil)
  escaped = string.split("|").map { |agent| Regexp.escape(agent) }.join("|")

  if type == :real && Rails.env == "test"
    # we need this bypass so we properly render views
    escaped << "|Rails Testing"
  end

  Regexp.new(escaped, Regexp::IGNORECASE)
end
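The pipe character is the delimiter: each segment is escaped individually with Regexp.escape, so metacharacters inside an agent name match literally while the pipes between segments remain alternations. A standalone sketch (the agent names are illustrative):

matcher = CrawlerDetection.to_matcher("Googlebot|bingbot|Yahoo! Slurp")
"Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)".match?(matcher) # => true
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)".match?(matcher)                         # => false
matcher.casefold? # => true (built with Regexp::IGNORECASE)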