Module: Simhash

Defined in:
lib/simhash.rb,
lib/simhash/version.rb,
lib/simhash/stopwords.rb,
lib/simhash/stopwords/en.rb,
lib/simhash/stopwords/ru.rb

Defined Under Namespace

Modules: Stopwords

Constant Summary collapse

VERSION =
"1.1.1"

Class Method Summary collapse

Class Method Details

.each_token(tokens, options = {}) ⇒ Object



41
42
43
44
45
46
47
48
49
# File 'lib/simhash.rb', line 41

# Yields every token that is non-empty and at least
# options[:token_min_size] characters long.
#
# @param tokens [#each] collection of string-like tokens
# @param options [Hash] :token_min_size is the minimum character count a
#   token needs in order to be yielded; nil or a missing key is treated as 0
# @yield [token] each token that passes the size filter
# @return [Enumerator] when no block is given; otherwise whatever
#   tokens.each returns
def self.each_token(tokens, options={})
  # Without a block there is nothing to yield to; hand back an Enumerator
  # instead of failing with LocalJumpError on the first accepted token.
  return enum_for(:each_token, tokens, options) unless block_given?

  token_min_size = options[:token_min_size].to_i

  tokens.each do |token|
    next if token.size.zero?

    # Use mb_chars when the token provides it; otherwise rely on the
    # token's own #size so the filter also works without that extension.
    char_count = token.respond_to?(:mb_chars) ? token.mb_chars.size : token.size
    next if char_count < token_min_size

    yield token
  end
end

.hash(tokens, options = {}) ⇒ Object

Parameters:

  • hashbits

    The number of bits in the resulting hash

  • hasher

    An object which returns a numeric hash when #hash is called with a token



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/simhash.rb', line 19

# Builds an integer fingerprint from the hashes of the given tokens.
#
# Each token accepted by each_token is hashed; for every bit position the
# token casts a +1 vote when that bit is set in its hash and a -1 vote when
# it is clear. A fingerprint bit is set when the vote total for that
# position is zero or positive, so with no accepted tokens every bit is set.
#
# @param tokens [#each] tokens to fingerprint (filtered through each_token)
# @param options [Hash] :hashbits is the number of bits in the result
#   (default 64); :hasher is an object whose #hash(token) result is converted
#   with #to_i (default StringHasher.new(hashbits)); the whole hash is also
#   forwarded to each_token
# @return [Integer] the fingerprint
def self.hash(tokens, options={})
  bit_count = options[:hashbits] || 64
  token_hasher = options[:hasher] || StringHasher.new(bit_count)

  # One signed vote counter per bit position.
  votes = Array.new(bit_count, 0)

  self.each_token(tokens, options) do |token|
    digest = token_hasher.hash(token).to_i

    bit_count.times do |bit|
      if (digest & (1 << bit)).zero?
        votes[bit] -= 1
      else
        votes[bit] += 1
      end
    end
  end

  # Ties (a total of exactly zero) count as a set bit.
  bit_count.times.inject(0) do |fingerprint, bit|
    votes[bit] >= 0 ? fingerprint + (1 << bit) : fingerprint
  end
end

.hm ⇒ Object



51
52
53
# File 'lib/simhash.rb', line 51

# Returns the current value of the class variable @@string_hash_method.
#
# NOTE(review): the assignment to @@string_hash_method is not visible in this
# excerpt — confirm where it is set and what kind of value it holds.
def self.hm
  @@string_hash_method
end