Module: Simhash
- Defined in:
- lib/simhash.rb,
lib/simhash/stopwords.rb,
lib/simhash/stopwords/en.rb,
lib/simhash/stopwords/ru.rb
Defined Under Namespace
Modules: Stopwords
Constant Summary
# Name of the String method used to hash a token when the caller supplies no
# :hashing_method option: :hash_vl when String defines it, otherwise
# :hash_vl_rb.
# NOTE(review): :hash_vl is presumably provided by an optional native
# extension and :hash_vl_rb by a pure-Ruby fallback — confirm.
#
# The check uses Module#public_method_defined? because
# String.public_instance_methods returns Symbols on Ruby >= 1.9, so the
# former include?("hash_vl") String comparison could never match there.
DEFAULT_STRING_HASH_METHOD =
  String.public_method_defined?(:hash_vl) ? :hash_vl : :hash_vl_rb
# Matches a run of one or more "non-word" pieces: whitespace, digits,
# non-letters, the byte pair \302\240 (UTF-8 non-breaking space), an em dash
# with optional surrounding spaces, and assorted quotes/dashes/ellipsis.
#
# Ruby >= 1.9 uses the Unicode property \p{L} to recognise letters; older
# versions fall back to \W.
PUNCTUATION_REGEXP =
  if RUBY_VERSION >= "1.9"
    /(\s|\d|[^\p{L}]|\302\240| *— *|[«»…\-–—]| )+/u
  else
    /(\s|\d|\W|\302\240| *— *|[«»…\-–—]| )+/u
  end
Class Method Summary
- .each_filtered_token(tokens, options = {}) ⇒ Object
- .filtered_tokens(tokens, options = {}) ⇒ Object
- .hash(tokens, options = {}) ⇒ Object
- .hm ⇒ Object
Class Method Details
.each_filtered_token(tokens, options = {}) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/simhash.rb', line 46
#
# Normalizes every token and yields those that survive filtering.
#
# Per token, in order:
#   1. runs matching PUNCTUATION_REGEXP are replaced by a single space
#      (skipped when :preserve_punctuation is truthy);
#   2. the token is stripped and downcased with Unicode::downcase;
#   3. when :stop_words is truthy, words found in Stopwords::ALL are removed;
#   4. the token is skipped if " token " is included in :stop_sentenses;
#   5. the token is skipped if it is empty or shorter (in characters, via
#      mb_chars) than :token_min_size.
#
# @param tokens [#each] strings to filter
# @param options [Hash]
# @option options [#to_i] :token_min_size minimum token length (nil => 0)
# @option options [#include?] :stop_sentenses (sic; the misspelled key is
#   what the method reads) collection probed with the space-padded token
# @option options [Boolean] :preserve_punctuation keep punctuation as is
# @option options [Boolean] :stop_words drop words listed in Stopwords::ALL
# @yield [token] each normalized token that passed all filters
# @return [Object] whatever tokens.each returns
def self.each_filtered_token(tokens, options = {})
  token_min_size = options[:token_min_size].to_i
  stop_sentenses = options[:stop_sentenses]

  tokens.each do |token|
    # Cut punctuation (\302\240 is a non-breaking space).
    token = token.gsub(PUNCTUATION_REGEXP, ' ') unless options[:preserve_punctuation]

    token = Unicode::downcase(token.strip)

    # Cut stop words; each word is looked up wrapped in single spaces.
    if options[:stop_words]
      token = token.split(' ').reject { |w| Stopwords::ALL.include?(" #{w} ") }.join(' ')
    end

    # Cut stop sentences; the lookup is likewise space-padded.
    next if stop_sentenses && stop_sentenses.include?(" #{token} ")

    next if token.empty? || token.mb_chars.size < token_min_size

    yield token
  end
end
.filtered_tokens(tokens, options = {}) ⇒ Object
67 68 69 70 71 |
# File 'lib/simhash.rb', line 67
#
# Collects everything each_filtered_token yields into an Array.
#
# @param tokens [#each] strings to filter
# @param options [Hash] passed through unchanged to each_filtered_token
# @return [Array] the normalized tokens that passed filtering, in order
def self.filtered_tokens(tokens, options = {})
  # Local is deliberately not named like the method to avoid shadowing it.
  result = []
  each_filtered_token(tokens, options) { |token| result << token }
  result
end
.hash(tokens, options = {}) ⇒ Object
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/simhash.rb', line 24
#
# Computes a simhash fingerprint over the filtered tokens.
#
# Each token accepted by each_filtered_token is hashed by calling
# token.send(hashing_method, hashbits).to_i. For every bit position i the
# counter v[i] goes up by one when that bit is set in the token hash and
# down by one otherwise. Bit i of the result is set when v[i] >= 0, so an
# input with no surviving tokens yields all hashbits bits set.
#
# @param tokens [#each] strings to fingerprint
# @param options [Hash] also forwarded to each_filtered_token
# @option options [Integer] :hashbits fingerprint width in bits (default 64)
# @option options [Symbol] :hashing_method String method to call on each
#   token (default DEFAULT_STRING_HASH_METHOD)
# @return [Integer] the fingerprint
def self.hash(tokens, options = {})
  hashbits = options[:hashbits] || 64
  hashing_method = options[:hashing_method] || DEFAULT_STRING_HASH_METHOD

  # Per-bit vote counters and the matching single-bit masks.
  v = Array.new(hashbits, 0)
  masks = Array.new(hashbits) { |i| 1 << i }

  each_filtered_token(tokens, options) do |token|
    hashed_token = token.send(hashing_method, hashbits).to_i
    hashbits.times do |i|
      v[i] += (hashed_token & masks[i]).zero? ? -1 : 1
    end
  end

  # Ties (v[i] == 0) count as a set bit.
  fingerprint = 0
  hashbits.times { |i| fingerprint |= masks[i] if v[i] >= 0 }
  fingerprint
end
.hm ⇒ Object
73 74 75 |
# File 'lib/simhash.rb', line 73
#
# Returns the current value of the class variable @@string_hash_method.
#
# NOTE(review): no assignment to @@string_hash_method is visible in this
# section. If nothing else sets it, calling this raises NameError — confirm.
#
# @return [Object] the value of @@string_hash_method
def self.hm
  @@string_hash_method
end