Class: Wikipedia::VandalismDetection::Algorithms::KullbackLeiblerDivergence

Inherits:
Object
  • Object
show all
Defined in:
lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb

Instance Method Summary collapse

Instance Method Details

#of(text_a, text_b) ⇒ Object

Returns the Symmetric Kullback-Leibler divergence with simple back-off of the given text’s character distribution. For implementation details, see: staff.science.uva.nl/~tsagias/?p=185

Raises:

  • (Exception)


11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb', line 11

def of(text_a, text_b)
  text_a = text_a.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
  text_b = text_b.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')

  return Features::MISSING_VALUE unless !!text_a.match(/[[:alnum:]]/) && !!text_b.match(/[[:alnum:]]/)

  distribution_a = character_distribution(text_a)
  distribution_b = character_distribution(text_b)

  sum_a = distribution_a.values.inject(:+)
  sum_b = distribution_b.values.inject(:+)

  character_diff = (distribution_b.keys - distribution_a.keys)

  epsilon = [(distribution_a.values.min / sum_a), (distribution_b.values.min / sum_b)].min * 0.001
  gamma = 1 - character_diff.size * epsilon

  # check if values sum up to 1.0
  sum_a_diff = 1.0 - distribution_a.values.inject(0) { |sum, value| sum += (value / sum_a) }.abs
  sum_b_diff = 1.0 - distribution_b.values.inject(0) { |sum, value| sum += (value / sum_b) }.abs

  raise(Exception, "Text a distr. does not sum up to 1.0") if sum_a_diff > 9e-6
  raise(Exception, "Text b distr. does not sum up to 1.0") if sum_b_diff > 9e-6

  divergence = 0.0

  distribution_a.each do |character, distribution|
    prob_a = distribution / sum_a
    prob_b = (distribution_b.has_key?(character) ? (gamma * (distribution_b[character] / sum_b)) : epsilon)

    divergence += ((prob_a - prob_b) * Math.log(prob_a / prob_b))
  end

  divergence
end