Module: Simhash

Defined in:
lib/simhash.rb,
lib/simhash/stopwords.rb,
lib/simhash/stopwords/en.rb,
lib/simhash/stopwords/ru.rb

Defined Under Namespace

Modules: Stopwords

Constant Summary collapse

# Name of the String method used to hash a token when the caller does not
# pass :hashing_method. :hash_vl is chosen when String already defines it,
# otherwise :hash_vl_rb is used.
#
# Module#method_defined? is used for the check because
# String.public_instance_methods returns Symbols on Ruby 1.9+, so looking up
# the String "hash_vl" in that list can never succeed there.
DEFAULT_STRING_HASH_METHOD =
  String.method_defined?(:hash_vl) ? :hash_vl : :hash_vl_rb
# Pattern matching one or more consecutive "non-word" pieces of a token:
# whitespace, digits, the byte pair \302\240 (described in each_filtered_token
# as the non-breaking space), dashes with optional surrounding spaces, and
# the quote/ellipsis/dash characters listed in the bracket expression.
# each_filtered_token replaces every match with a single space.
#
# NOTE(review): the final alternative renders as a plain space here; since \s
# already covers that, it is presumably a literal non-breaking space — confirm
# against the original file before editing this literal.
PUNCTUATION_REGEXP =
# RUBY_VERSION is compared as a String (lexicographic comparison).
if RUBY_VERSION >= "1.9"
  # 1.9+ branch: additionally treats every character that is not a Unicode
  # letter (\p{L}) as punctuation.
  /(\s|\d|[^\p{L}]|\302\240| *— *|[«»…\-–—]| )+/u
else
  # Older interpreters: uses \W in place of the \p{L} property class.
  /(\s|\d|\W|\302\240| *— *|[«»…\-–—]| )+/u
end

Class Method Summary collapse

Class Method Details

.each_filtered_token(tokens, options = {}) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/simhash.rb', line 46

# Normalises every token and yields the ones that survive filtering.
#
# Each token is stripped of punctuation (unless :preserve_punctuation is set),
# trimmed, lower-cased with Unicode::downcase, optionally cleaned of stop
# words, and then skipped when it is a stop sentence, empty, or shorter than
# :token_min_size.
#
# tokens  - object responding to #each and yielding Strings.
# options - Hash of filtering switches:
#           :token_min_size       - minimum token length; converted with to_i,
#                                   so nil means 0.
#           :stop_sentenses       - (key spelling kept for compatibility)
#                                   object answering include?(" token ");
#                                   matching tokens are skipped.
#           :preserve_punctuation - when truthy, PUNCTUATION_REGEXP is not
#                                   applied.
#           :stop_words           - when truthy, words found in
#                                   Stopwords::ALL are removed from the token.
#
# Returns the result of tokens.each when a block is given; without a block an
# Enumerator over the filtered tokens is returned instead of failing on yield.
def self.each_filtered_token(tokens, options={})
  return enum_for(:each_filtered_token, tokens, options) unless block_given?

  token_min_size = options[:token_min_size].to_i
  stop_sentences = options[:stop_sentenses]

  tokens.each do |token|
    # cutting punctuation (\302\240 is a non-breaking space)
    token = token.gsub(PUNCTUATION_REGEXP, ' ') unless options[:preserve_punctuation]

    token = Unicode::downcase(token.strip)

    # cutting stop-words; each word is looked up padded with spaces
    if options[:stop_words]
      token = token.split(" ").reject { |w| Stopwords::ALL.include?(" #{w} ") }.join(" ")
    end

    # cutting stop-sentences
    next if stop_sentences && stop_sentences.include?(" #{token} ")

    next if token.empty? || token.mb_chars.size < token_min_size

    yield token
  end
end

.filtered_tokens(tokens, options = {}) ⇒ Object



67
68
69
70
71
# File 'lib/simhash.rb', line 67

# Collects everything each_filtered_token yields for the given tokens and
# options into a new Array and returns it.
#
# tokens  - passed through unchanged to each_filtered_token.
# options - passed through unchanged to each_filtered_token.
#
# Returns an Array of the yielded tokens, in the order they were yielded.
def self.filtered_tokens(tokens, options={})
  [].tap do |kept|
    each_filtered_token(tokens, options) { |token| kept.push(token) }
  end
end

.hash(tokens, options = {}) ⇒ Object



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/simhash.rb', line 24

# Builds an integer fingerprint from the filtered tokens.
#
# Every token yielded by each_filtered_token is hashed by sending it
# hashing_method with hashbits as the argument. For each bit position a
# counter goes up when the token hash has that bit set and down when it does
# not. The fingerprint has a bit set wherever the counter ends up >= 0, so a
# tie (including the no-token case) sets the bit.
#
# tokens  - passed to each_filtered_token.
# options - Hash; besides the each_filtered_token options it reads:
#           :hashbits       - number of fingerprint bits (default 64).
#           :hashing_method - String method name used to hash a token
#                             (default DEFAULT_STRING_HASH_METHOD).
#
# Returns the fingerprint as an Integer.
def self.hash(tokens, options={})
  hashbits = options[:hashbits] || 64
  hashing_method = options[:hashing_method] || DEFAULT_STRING_HASH_METHOD

  weights = Array.new(hashbits, 0)

  each_filtered_token(tokens, options) do |token|
    digest = token.send(hashing_method, hashbits).to_i
    weights.each_index do |bit|
      # Integer#[] reads a single bit: 0 when clear, 1 when set
      weights[bit] += digest[bit].zero? ? -1 : 1
    end
  end

  (0...hashbits).inject(0) do |fingerprint, bit|
    weights[bit] >= 0 ? fingerprint | (1 << bit) : fingerprint
  end
end

.hm ⇒ Object



73
74
75
# File 'lib/simhash.rb', line 73

# Returns the name of the String hashing method currently in effect.
#
# The class variable @@string_hash_method is returned when it has been set.
# When it is not defined, DEFAULT_STRING_HASH_METHOD is returned instead of
# letting the bare class-variable read raise.
#
# NOTE(review): no assignment to @@string_hash_method is visible in this
# listing — confirm whether anything else in the file still sets it.
def self.hm
  defined?(@@string_hash_method) ? @@string_hash_method : DEFAULT_STRING_HASH_METHOD
end