Module: Simhash
Defined in:
- lib/simhash.rb
- lib/simhash/version.rb
- lib/simhash/stopwords.rb
- lib/simhash/stopwords/en.rb
- lib/simhash/stopwords/ru.rb
Defined Under Namespace
Modules: Stopwords
Constant Summary
- VERSION = "1.1.1"
Class Method Summary
Class Method Details
.each_token(tokens, options = {}) ⇒ Object
41 42 43 44 45 46 47 48 49 |
# File 'lib/simhash.rb', line 41
def self.each_token(tokens, options={})
  token_min_size = options[:token_min_size].to_i
  tokens.each do |token|
    next if token.size.zero? || token.mb_chars.size < token_min_size
    yield token
  end
end
.hash(tokens, options = {}) ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/simhash.rb', line 19
def self.hash(tokens, options={})
  hashbits = options[:hashbits] || 64
  hasher = options[:hasher] || StringHasher.new(hashbits)
  v = [0] * hashbits
  masks = v.dup
  masks.each_with_index {|e, i| masks[i] = (1 << i)}
  self.each_token(tokens, options) do |token|
    hashed_token = hasher.hash(token).to_i
    hashbits.times do |i|
      v[i] += (hashed_token & masks[i]).zero? ? -1 : +1
    end
  end
  fingerprint = 0
  hashbits.times { |i| fingerprint += 1 << i if v[i] >= 0 }
  fingerprint
end
.hm ⇒ Object
51 52 53 |
# File 'lib/simhash.rb', line 51
def self.hm
  @@string_hash_method
end