Class: Wikipedia::VandalismDetection::Evaluator

Inherits:
Object
  • Object
show all
Defined in:
lib/wikipedia/vandalism_detection/evaluator.rb

Overview

This class provides methods for the evaluation of a Wikipedia::VandalismDetection::Classifier using the weka framework.

Examples:

classifier = Wikipedia::VandalismDetection::Classifier.new
evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)

evaluation = evaluator.cross_validate
evaluation = evaluator.cross_validate(equally_distributed: true)

puts evaluation[:precision]
puts evaluation[:recall]
puts evaluation[:area_under_prc]

Constant Summary collapse

DEFAULT_SAMPLE_COUNT =
200.freeze

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(classifier) ⇒ Evaluator

Returns a new instance of Evaluator.

Raises:

  • (ArgumentError)


31
32
33
34
35
36
37
38
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 31

# Builds an Evaluator for the given trained classifier.
#
# @param classifier [Wikipedia::VandalismDetection::Classifier] the classifier to evaluate
# @raise [ArgumentError] if classifier is not a Classifier instance
def initialize(classifier)
  unless classifier.is_a?(Wikipedia::VandalismDetection::Classifier)
    raise ArgumentError, 'Classifier param has to be a Wikipedia::VandalismDetection::Classifier instance'
  end

  @config = Wikipedia::VandalismDetection.configuration
  @classifier = classifier
  @classifier_instance = classifier.classifier_instance
end

Class Method Details

.false_negative?(target_class, confidence, threshold) ⇒ Boolean

Returns whether the given confidence value represents a false negative (FN) regarding the given target class and threshold.

Returns:

  • (Boolean)


222
223
224
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 222

# A false negative (FN): an actual vandalism edit whose confidence does
# not exceed the threshold, i.e. it was classified as regular.
#
# @return [Boolean]
def self.false_negative?(target_class, confidence, threshold)
  return false unless target_class == Instances::VANDALISM_SHORT

  confidence.to_f <= threshold.to_f
end

.false_positive?(target_class, confidence, threshold) ⇒ Boolean

Returns whether the given confidence value represents a false positive (FP) regarding the given target class and threshold.

Returns:

  • (Boolean)


216
217
218
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 216

# A false positive (FP): an actual regular edit whose confidence reaches
# the threshold, i.e. it was classified as vandalism.
#
# @return [Boolean]
def self.false_positive?(target_class, confidence, threshold)
  return false unless target_class == Instances::REGULAR_SHORT

  confidence.to_f >= threshold.to_f
end

.true_negative?(target_class, confidence, threshold) ⇒ Boolean

Returns whether the given confidence value represents a true negative (TN) regarding the given target class and threshold.

Returns:

  • (Boolean)


210
211
212
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 210

# A true negative (TN): an actual regular edit whose confidence stays
# below the threshold, i.e. it was correctly classified as regular.
#
# @return [Boolean]
def self.true_negative?(target_class, confidence, threshold)
  return false unless target_class == Instances::REGULAR_SHORT

  confidence.to_f < threshold.to_f
end

.true_positive?(target_class, confidence, threshold) ⇒ Boolean

Returns whether the given confidence value represents a true positive (TP) regarding the given target class and threshold.

Returns:

  • (Boolean)


204
205
206
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 204

# A true positive (TP): an actual vandalism edit whose confidence lies
# above the threshold, i.e. it was correctly classified as vandalism.
#
# @return [Boolean]
def self.true_positive?(target_class, confidence, threshold)
  return false unless target_class == Instances::VANDALISM_SHORT

  confidence.to_f > threshold.to_f
end

Instance Method Details

#area_under_curve(x_values, y_values) ⇒ Object

Returns the calculated area under curve for given point values x and y values has to be float arrays of the same length.

Raises:

  • (ArgumentError)


241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 241

# Computes the area under the curve defined by the given points using
# the trapezoidal rule. Both arrays must have the same length.
#
# @param x_values [Array<Float>] x coordinates of the curve points
# @param y_values [Array<Float>] y coordinates of the curve points
# @return [Float] the absolute area under the curve
# @raise [ArgumentError] if the arrays differ in length
def area_under_curve(x_values, y_values)
  raise ArgumentError, 'x and y values must have the same length!' unless x_values.count == y_values.count

  points = x_values.zip(y_values)

  # trapezoid area formula: A = 1/2 * (b1 + b2) * h
  area = points.each_cons(2).reduce(0.0) do |sum, ((x1, y1), (x2, y2))|
    sum + 0.5 * (y1 + y2) * (x2 - x1)
  end

  area.abs
end

#create_testcorpus_classification_file!(file_path, ground_truth_data) ⇒ Object

Creates the test corpus text file by classifying the configured test samples. All sub steps (as creating the test arff file, etc.) are run automatically if needed.

Raises:

  • (ArgumentError)


310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 310

# Creates the test corpus text file by classifying the configured test
# samples. All sub steps (such as creating the test arff file) are run
# automatically if needed.
#
# Output line format:
#   OLDREVID NEWREVID C CONF FEATURE_1 ... FEATURE_N
#
# @param file_path [String] path of the classification file to write
# @param ground_truth_data [Hash] parsed ground truth (must not be nil)
# @raise [ArgumentError] if ground_truth_data is nil
def create_testcorpus_classification_file!(file_path, ground_truth_data)
  raise(ArgumentError, "Ground truth data hash is not allowed to be nil!") if ground_truth_data.nil?

  dataset = TestDataset.build!

  dir_name = File.dirname(file_path)
  # Dir.exist? replaces Dir.exists?, which is deprecated and removed in Ruby 3.2
  FileUtils.mkdir_p(dir_name) unless Dir.exist?(dir_name)

  # Block form guarantees the file handle is closed even if classification raises.
  File.open(file_path, 'w') do |file|
    feature_names = dataset.enumerate_attributes.to_a.map { |attr| attr.name.upcase }[0...-2]
    header = ['OLDREVID', 'NEWREVID', 'C', 'CONF', *feature_names].join(' ')

    file.puts header

    dataset.to_a2d.each do |instance|
      features = instance[0...-3]
      old_revision_id = instance[-3].to_i
      new_revision_id = instance[-2].to_i
      ground_truth_class_name = Instances::CLASSES_SHORT[Instances::CLASSES.key(instance[-1])]

      classification = @classifier.classify(features, return_all_params: true)
      class_value = Features::MISSING_VALUE

      if @config.classifier_type.match(/Functions::LibSVM/) && @config.classifier_options.match(/-s 2/i)
        # LibSVM with one class has only one class during training:
        # vandalism gets class index 0 while classifying, regular gets
        # missing (Instances::NOT_KNOWN_INDEX in Wikipedia::VandalismDetection::Classifier)
        if classification[:class_index] == 0
          class_value = 1.0
        elsif classification[:class_index] == Instances::NOT_KNOWN_INDEX
          class_value = 0.0
        end
      else
        if classification[:class_index] == Instances::VANDALISM_CLASS_INDEX
          class_value = 1.0
        elsif classification[:class_index] == Instances::REGULAR_CLASS_INDEX
          class_value = 0.0
        end
      end

      confidence = classification[:confidence] || class_value

      # One-class classifiers trained on the vandalism class produce inverted
      # confidences — flip them so higher always means "more likely vandalism".
      must_be_inverted = @config.use_occ? && !!(@classifier.classifier_instance.options =~ /#{Instances::VANDALISM}/)
      confidence_value = must_be_inverted ? (1.0 - confidence) : confidence
      features = features.join(' ').gsub(Float::NAN.to_s, Features::MISSING_VALUE).split

      file.puts [old_revision_id, new_revision_id, ground_truth_class_name, confidence_value, *features].join(' ')
    end
  end
end

#cross_validate(options = {}) ⇒ Object

Cross validates the classifier. Fold is used as defined in configuration (default is 10).

Examples:

classifier = Wikipedia::VandalismDetection::Classifier.new
evaluation = classifier.cross_validate
evaluation = classifier.cross_validate(equally_distributed: true)


48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 48

# Cross validates the classifier. The fold is taken from the
# configuration (falling back to the default of 10).
#
# @param options [Hash] pass equally_distributed: true to balance the classes
# @return the Weka evaluation object(s)
def cross_validate(options = {})
  fold_default = Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS['classifier']['cross-validation-fold']
  fold = @config.cross_validation_fold || fold_default

  if options[:equally_distributed]
    cross_validate_equally_distributed(fold)
  else
    cross_validate_all_instances(fold)
  end
end

#curve_data(options = {}) ⇒ Object

Returns a Hash comprising the evaluation curve data Arrays for precision, recall

or

classifier = Wikipedia::VandalismDetection::Classifier.new
evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)

curve_data = evaluator.curve_data

curve_data[:precision]
# => [0.76, ..., 0.91]

curve_data[:recall]
# => [0.87, ..., 0.89]

curve_data[:area_under_prc]
# => 0.83

Examples:

classifier = Wikipedia::VandalismDetection::Classifier.new
evaluator = classifier.evaluator


80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 80

# Computes precision/recall curve data from a cross validation run.
#
# @param options [Hash] forwarded to #cross_validate
# @return [Hash] { precision: [...], recall: [...], area_under_prc: Float }
def curve_data(options = {})
  evaluations = cross_validate(options)
  evaluation_data = evaluations.is_a?(Array) ? evaluations.first : evaluations

  threshold_curve = Weka::Classifiers::Evaluation::ThresholdCurve.new
  curve_instances = threshold_curve.curve(evaluation_data.predictions, Instances::VANDALISM_CLASS_INDEX)

  {
    precision: curve_instances.return_attr_data('Precision'),
    recall: curve_instances.return_attr_data('Recall'),
    area_under_prc: evaluation_data.area_under_prc(Instances::VANDALISM_CLASS_INDEX)
  }
end

#evaluate_testcorpus_classification(options = {}) ⇒ Object

Evaluates the classification of the configured test corpus against the given ground truth. Runs the file creation automatically unless the classification file already exists.

Number of samples to use can be set by the ‘sample_count: <number>’ parameter. The default number of samples is 200 (DEFAULT_SAMPLE_COUNT).

Returns a Hash with values:

:recalls - recall values
:precisions - precision values
:fp_rates - false positive rate values
:auprc - area under precision recall curve
:auroc - area under receiver operator curve
:total_recall - overall classifier recall value
:total_precision - overall classifier precision value

or

classifier = Wikipedia::VandalismDetection::Classifier.new
evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)

evaluator.evaluate_testcorpus_classification
evaluator.evaluate_testcorpus_classification(sample_count: 50)

Examples:

classifier = Wikipedia::VandalismDetection::Classifier.new
evaluator = classifier.evaluator

Raises:



119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 119

# Evaluates the classification of the configured test corpus against
# the configured ground truth file. Creates the classification file
# first via #create_testcorpus_classification_file!.
#
# @param options [Hash] sample_count: <number> (default DEFAULT_SAMPLE_COUNT)
# @return [Hash] curve data plus :total_recall and :total_precision
# @raise [GroundTruthFileNotConfiguredError] if no ground truth file is configured
# @raise [GroundTruthFileNotFoundError] if the configured file does not exist
def evaluate_testcorpus_classification(options = {})
  ground_truth_file_path = @config.test_corpus_ground_truth_file

  unless ground_truth_file_path
    raise GroundTruthFileNotConfiguredError, 'Ground truth file path has to be set for test set evaluation!'
  end

  unless File.exist?(ground_truth_file_path)
    raise GroundTruthFileNotFoundError, 'Configured ground truth file is not available.'
  end

  ground_truth = ground_truth_hash(ground_truth_file_path)
  create_testcorpus_classification_file!(@config.test_output_classification_file, ground_truth)
  classification = classification_hash(@config.test_output_classification_file)

  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  curves = test_performance_curves(ground_truth, classification, sample_count)
  best_pair = maximum_precision_recall(curves[:precisions], curves[:recalls])

  curves[:total_recall] = best_pair[:recall]
  curves[:total_precision] = best_pair[:precision]

  curves
end

#feature_analysis(options = {}) ⇒ Object

Returns a hash comprising each feature’s predictive values analysis for different thresholds. The Hash structure is the following one: {

feature_name_1:
 {
    0.0 => {fp: , fn: , tp: , tn: },
    ... => {fp: , fn: , tp: , tn: },
    1.0 => {fp: , fn: , tp: , tn: }
 },
...,
feature_name_n:
 {
    0.0 => {fp: , fn: , tp: , tn: },
    ... => {fp: , fn: , tp: , tn: },
    1.0 => {fp: , fn: , tp: , tn: }
 },

}



380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 380

# Analyzes each configured feature in isolation: trains a classifier on
# the single feature and computes predictive values (TP/FP/TN/FN) for
# sample_count thresholds between 0.0 and 1.0.
#
# @param options [Hash] sample_count: <number> (default DEFAULT_SAMPLE_COUNT)
# @return [Hash] { feature_name => { threshold => { tp:, fp:, tn:, fn: } } }
def feature_analysis(options = {})
  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  thresholds = (0.0..1.0).step(1.0 / (sample_count - 1)).to_a

  ground_truth_file_path = @config.test_corpus_ground_truth_file
  # The ground truth is independent of the analyzed feature, so parse it
  # once here instead of re-reading the file on every loop iteration.
  ground_truth = ground_truth_hash(ground_truth_file_path)

  training_dataset = TrainingDataset.instances
  test_dataset = TestDataset.build!

  analysis = {}

  @config.features.each_with_index do |feature_name, index|
    puts "analyzing feature... '#{feature_name}'"

    dataset = filter_single_attribute(training_dataset, index)
    print ' | train classifier with feature data...'
    classifier = Classifier.new(dataset)
    print "done \n"

    classification = classification_data(classifier, test_dataset)

    values = {}

    thresholds.each do |threshold|
      values[threshold] = predictive_values(ground_truth, classification, threshold)
    end

    analysis[feature_name] = values
  end

  analysis
end

#full_analysis(options = {}) ⇒ Object

Returns a hash comprising the classifiers predictive values for using all configured features for different thresholds.



415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 415

# Computes the classifier's predictive values (TP/FP/TN/FN) using all
# configured features for sample_count thresholds between 0.0 and 1.0.
#
# @param options [Hash] sample_count: <number> (default DEFAULT_SAMPLE_COUNT)
# @return [Hash] { threshold => { tp:, fp:, tn:, fn: } }
def full_analysis(options = {})
  sample_count = options[:sample_count] || DEFAULT_SAMPLE_COUNT
  thresholds = (0.0..1.0).step(1.0 / (sample_count - 1)).to_a

  puts 'train classifier...'
  classifier = Classifier.new

  test_dataset = TestDataset.build!

  puts 'computing classification...'
  classification = classification_data(classifier, test_dataset)
  ground_truth = ground_truth_hash(@config.test_corpus_ground_truth_file)

  analysis = thresholds.each_with_object({}) do |threshold, result|
    result[threshold] = predictive_values(ground_truth, classification, threshold)
  end

  print "done \n"
  analysis
end

#maximum_precision_recall(precisions, recalls) ⇒ Object

Returns the maximum precision recall pair



297
298
299
300
301
302
303
304
305
306
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 297

# Returns the precision/recall pair that maximizes precision * recall.
#
# @param precisions [Array<Float>] precision values
# @param recalls [Array<Float>] recall values (same length as precisions)
# @return [Hash] { precision: Float, recall: Float } at the maximizing index
def maximum_precision_recall(precisions, recalls)
  products = precisions.each_with_index.map do |precision, index|
    [precision * recalls[index], index]
  end

  # drop pairs whose product is NaN (precision or recall was undefined)
  products.select! { |product, _index| !product.to_f.nan? }

  # Array#max compares by product first, then index — the previous
  # `sort.max` sorted the whole array only to take its maximum.
  max_index = products.max[1]

  { precision: precisions[max_index], recall: recalls[max_index] }
end

#performance_parameters(tp, fp, tn, fn) ⇒ Object

Returns a hash with performance parameters computed from given TP, FP, TN, FN



227
228
229
230
231
232
233
234
235
236
237
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 227

# Computes precision, recall and false positive rate from the given
# confusion matrix counts. A degenerate denominator (no relevant
# samples) yields 1.0 for that measure.
#
# @return [Hash] { precision:, recall:, fp_rate: }
def performance_parameters(tp, fp, tn, fn)
  ratio = lambda do |numerator, denominator|
    denominator.zero? ? 1.0 : numerator.to_f / denominator.to_f
  end

  {
      precision: ratio.call(tp, tp + fp),
      recall: ratio.call(tp, tp + fn),
      fp_rate: ratio.call(fp, fp + tn)
  }
end

#predictive_values(ground_truth, classification, threshold) ⇒ Object

Returns the predictive values hash (TP,FP, TN, FN) for a certain threshold.



178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 178

# Counts TP, FP, TN and FN over all ground truth samples for the given
# threshold. Annotated samples missing from the classification hash are
# skipped.
#
# @return [Hash] { tp:, fp:, tn:, fn: }
def predictive_values(ground_truth, classification, threshold)
  counts = { tp: 0, fp: 0, tn: 0, fn: 0 }

  ground_truth.each_value do |values|
    target_class = values[:class]
    key = :"#{values[:old_revision_id]}-#{values[:new_revision_id]}"

    # skip annotated samples that were not classified
    next unless classification.key?(key)

    confidence = classification[key][:confidence]

    counts[:tp] += 1 if Evaluator.true_positive?(target_class, confidence, threshold)  # vandalism classified as vandalism
    counts[:fn] += 1 if Evaluator.false_negative?(target_class, confidence, threshold) # vandalism classified as regular
    counts[:fp] += 1 if Evaluator.false_positive?(target_class, confidence, threshold) # regular classified as vandalism
    counts[:tn] += 1 if Evaluator.true_negative?(target_class, confidence, threshold)  # regular classified as regular
  end

  counts
end

#sort_curve_values(x_values, y_values, start_values = nil, end_values = nil) ⇒ Object

Returns given value array sorted by first array (x_values) Return value is a Hash { x: <x_values_sorted>, y: <y_values_sorted_by_x> } start_value is added in front of arrays if set, e.g. 0.0, y: 1.0 end_values is added to end of arrays if set, e.g. {x: 1.0, y: 1.0 }

Examples:

evaluator.sort_curve_values(x, y, { x: 0.0, y: 0.0 }, { x: 1.0, y: 1.0 })
#=>Hash { x: [0.0, *x, 1.0], y: [0.0, *y, 1.0] }


269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 269

# Sorts the given y values by their x values (ascending x, descending y
# for equal x) and removes duplicate points.
# start_values, when given, is prepended unless the curve already starts
# there; end_values is likewise appended. A missing :x or :y component
# falls back to the current first/last value.
#
# @example
#   evaluator.sort_curve_values(x, y, { x: 0.0, y: 0.0 }, { x: 1.0, y: 1.0 })
#   #=> { x: [0.0, *x, 1.0], y: [0.0, *y, 1.0] }
#
# @return [Hash] { x: <x values sorted>, y: <y values sorted by x> }
def sort_curve_values(x_values, y_values, start_values = nil, end_values = nil)
  pairs = x_values.each_with_index.map { |x, index| [x, y_values[index]] }
  pairs = pairs.sort_by { |x, y| [x, -y] }.uniq

  # transpose once and destructure instead of transposing twice
  x, y = pairs.transpose

  start_values_set = start_values && (start_values.key?(:x) || start_values.key?(:y))
  end_values_set = end_values && (end_values.key?(:x) || end_values.key?(:y))

  if start_values_set && !(x.first == start_values[:x] && y.first == start_values[:y])
    x.unshift(start_values[:x] || x.first)
    y.unshift(start_values[:y] || y.first)
  end

  if end_values_set && !(x.last == end_values[:x] && y.last == end_values[:y])
    x.push(end_values[:x] || x.last)
    y.push(end_values[:y] || y.last)
  end

  { x: x, y: y }
end

#test_performance_curves(ground_truth, classification, sample_count) ⇒ Object

Returns the performance curve points (recall, precision, fp-rate) and computed area under curves.



143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# File 'lib/wikipedia/vandalism_detection/evaluator.rb', line 143

# Computes precision/recall (PR) and ROC curve points over sample_count
# thresholds, together with the areas under both curves.
#
# @return [Hash] { precisions:, recalls:, fp_rates:, tp_rates:, pr_auc:, roc_auc: }
def test_performance_curves(ground_truth, classification, sample_count)
  thresholds = (0.0...1.0).step(1.0 / sample_count.to_f).to_a
  thresholds.shift # drop the first threshold so the [0,1] point is not part of the curve

  precisions = []
  recalls = []
  fp_rates = []

  thresholds.each do |threshold|
    counts = predictive_values(ground_truth, classification, threshold)
    params = performance_parameters(counts[:tp], counts[:fp], counts[:tn], counts[:fn])

    precisions << params[:precision]
    recalls << params[:recall]
    fp_rates << params[:fp_rate]
  end

  # the true positive rate equals recall
  pr_sorted = sort_curve_values(recalls, precisions, { x: 0.0 }, { y: 0.0 })
  roc_sorted = sort_curve_values(fp_rates, recalls, { y: 0.0 }, { x: 1.0 })

  {
    precisions: pr_sorted[:y],
    recalls: pr_sorted[:x],
    fp_rates: roc_sorted[:x],
    tp_rates: roc_sorted[:y],
    pr_auc: area_under_curve(pr_sorted[:x], pr_sorted[:y]),
    roc_auc: area_under_curve(roc_sorted[:x], roc_sorted[:y])
  }
end