Class: Splam::Rules::Href

Inherits:
Splam::Rule show all
Defined in:
lib/splam/rules/href.rb

Overview

This plugin checks for links in the text, and adds scores for having many links, for malformed link markup, and for links to suspicious domains.

Instance Attribute Summary

Attributes inherited from Splam::Rule

#body, #reasons, #score, #suite, #weight

Instance Method Summary collapse

Methods inherited from Splam::Rule

#add_score, inherited, #initialize, #line_safe?, #name, run

Constructor Details

This class inherits a constructor from Splam::Rule

Instance Method Details

#run ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/splam/rules/href.rb', line 5

# Scores the body for link-related spam signals.
#
# Reads @body and reports every finding through add_score(points, reason).
# Signals checked, in order:
# * malformed anchor markup (unquoted href, or whitespace after the opening quote)
# * a body that starts with "<a" and ends with "</a>" on a single line
# * the number of "http://" occurrences, with extra penalties past 10, 20 and 50
# * a body whose last whitespace-separated token is an http:// URL
# * hosts whose top-level domain or second-level label is on a watch list
#
# add_score is invoked even when a count is zero (with 0 points), exactly as
# the rule has always done; filtering of zero scores, if any, is left to it.
def run
  add_score 35 * @body.scan(/href\=\s*http/).size, "Shitty html 'href=http'" # 35 points per unquoted href
  add_score 35 * @body.scan(/href\="\s+http/).size, "Shitty html 'href=\" http'" # 35 points per href with whitespace after the quote
  add_score 50 * @body.scan(/\A<a.*?<\/a>\Z/).size, "Single link post'" # 50 points when the whole body is an anchor

  link_count = @body.scan("http://").size
  add_score 1 * link_count, "Matched 'http://'" # 1 point per link
  add_score 50, "More than 10 links" if link_count > 10   # more than 10 links? spam.
  add_score 100, "More than 20 links" if link_count > 20  # more than 20 links? definitely spam.
  add_score 1000, "More than 50 links" if link_count > 50 # more than 50 links? no doubt left.

  # Modify these scores to weight certain problematic domains.
  # You may need to modify these for your application
  suspicious_top_level_domains = {
    'ru' => 20,  # Russian? spammer.
    'cn' => 20,  # Chinese? spammer.
    'us' => 8,   # .us ? possibly spam
    'it' => 5,
    'tk' => 20,
    'eu' => 20,
    'pl' => 8,
    'info' => 20,
    'biz'  => 40 # no-one uses these for reals
  }
  suspicious_sites = {
    'cnn' => 10, # Honestly, who links to CNN?
    'bbc' => 10
  }

  # to_s keeps an empty body (no tokens at all) from raising below.
  last_token = @body.split(" ").last.to_s
  ends_with_link = last_token.start_with?("http://")

  if ends_with_link
    add_score 20, "Text ends in a http token"
    add_score 150, "Text ends in a http token and only has one token" if link_count == 1
    # Regexp.union keeps the alternation grouped when interpolated; joining the
    # keys with "|" made the pattern read as "http://cnn" OR "bbc.", so any
    # URL merely containing "bbc." was penalised.
    site_pattern = /http:\/\/#{Regexp.union(suspicious_sites.keys)}\./
    add_score 150, "Text ends in a http token with a shitty domain " if last_token =~ site_pattern
  end

  # Capture only the host part: stop at the first path, query, port, quote,
  # bracket or whitespace character, or at the end of the body. The previous
  # lazy ".*?" required a trailing delimiter and could swallow quotes or
  # whitespace into the "host".
  @body.scan(%r{http://([^\s/>\]?"'<:]+)}) do |captures|
    # scan yields an Array of groups when the pattern has a capture; calling
    # to_s on that Array gives its inspect form on Ruby >= 1.9, which broke
    # every lookup below.
    labels = captures.first.split(".")
    tld = labels[-1]
    site = labels[-2]

    if (points = suspicious_top_level_domains[tld])
      add_score points, "Suspicious top-level domain: '#{tld}'"
    end

    if (points = suspicious_sites[site])
      add_score points, "Suspicious hostname: '#{site}'"
      add_score points * 5, "..document ends in suspicious hostname" if ends_with_link
    end
  end
end