Class: Fuzzy::Scorer

Inherits:
Object
  • Object
show all
Defined in:
lib/fuzzy.rb

Defined Under Namespace

Classes: TermSet, Token

Instance Method Summary collapse

Constructor Details

#initialize(corpus) ⇒ Scorer

Returns a new instance of Scorer.



23
24
25
26
27
28
# File 'lib/fuzzy.rb', line 23

# Builds the scorer state from raw corpus entries.
#
# corpus - a collection of entries read via entry[:weight] and entry[:terms];
#          nil is treated as an empty collection. Entries whose weight or
#          terms are blank are dropped, as are blank individual terms.
#
# Stores the resulting TermSet list, the sum of their weights, and the
# flattened list produced by calling #tokens on each TermSet with that sum.
def initialize(corpus)
  usable_entries = (corpus || []).reject do |entry|
    entry[:weight].blank? || entry[:terms].blank?
  end

  @corpus = usable_entries.map do |entry|
    present_terms = entry[:terms].reject(&:blank?)
    TermSet.new(entry[:weight], present_terms)
  end

  @total_weight = @corpus.sum(&:weight)
  @weighted_tokens = @corpus.flat_map { |term_set| term_set.tokens(@total_weight) }
end

Instance Method Details

#normalized_tokens ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
# File 'lib/fuzzy.rb', line 50

# Rescales the weights returned by #tokens with a linear transform so that
# the smallest weight maps to 0 and the largest maps to 1.
#
# Returns an Array of Token objects holding the original token text and the
# rescaled weight. Returns an empty Array when there are no tokens. When
# every token has the same weight the transform is undefined (its slope
# would be a division by zero), so each token is given a weight of 1.0.
def normalized_tokens
  basic_tokens = tokens
  return [] if basic_tokens.empty?

  min, max = basic_tokens.map(&:weight).minmax
  # A zero range would make the slope infinite and every result NaN.
  return basic_tokens.map { |t| Token.new(t.token, 1.0) } if max == min

  # Calculate m and c values for the linear transform y = mx + c
  # m = (y' - y) / (x' - x)
  m = (1 - 0).fdiv(max - min)
  # Substituting the max values in, we get 1 = m(max) + c
  c = 1 - (m * max)
  basic_tokens.map { |t| Token.new(t.token, (t.weight * m) + c) }
end

#rank(query) ⇒ Object



30
31
32
33
34
35
36
37
38
# File 'lib/fuzzy.rb', line 30

# Scores +query+ against the precomputed weighted tokens.
#
# Every weighted token whose text starts with +query+ contributes
# (query length / token length) * token weight; all other tokens contribute
# zero times their weight. The result is the sum of all contributions
# divided by the number of strictly positive contributions.
#
# query - the prefix to look for at the start of each token.
#
# Returns the averaged score, or 0 when no contribution is positive.
def rank(query)
  scores = @weighted_tokens.map do |wt|
    # Core String#start_with? keeps this usable without ActiveSupport's
    # starts_with? alias.
    prefix_ratio = wt.token.start_with?(query) ? query.length.fdiv(wt.token.length) : 0
    prefix_ratio * wt.weight
  end

  score_count = scores.count(&:positive?)
  return 0 if score_count.zero?

  scores.sum / score_count
end

#tokenize ⇒ Object



40
41
42
43
44
# File 'lib/fuzzy.rb', line 40

# Collects every leading prefix (length 1 up to the full length) of every
# cleaned term held by the corpus entries.
#
# Returns a Set of the prefixes, so duplicates across terms collapse.
def tokenize
  terms = @corpus.flat_map { |term_set| term_set.cleaned_terms.to_a }

  prefixes = terms.flat_map do |term|
    1.upto(term.length).map { |prefix_length| term[0, prefix_length] }
  end

  prefixes.to_set
end

#tokens ⇒ Object



46
47
48
# File 'lib/fuzzy.rb', line 46

# Pairs each prefix produced by #tokenize with its #rank score.
#
# Returns an Array of Token objects built from (prefix, score).
def tokens
  tokenize.map do |prefix|
    score = rank(prefix)
    Token.new(prefix, score)
  end
end