Class: Wikipedia::VandalismDetection::TestDataset

Inherits:
Object
  • Object
show all
Defined in:
lib/wikipedia/vandalism_detection/test_dataset.rb

Overview

This class provides methods for getting and creating a test ARFF file from a configured test corpus.

Class Method Summary collapse

Class Method Details

.buildObject Also known as: instances

Returns an instance dataset built from the configured gold annotation file, computing the configured features with a FeatureCalculator.



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/wikipedia/vandalism_detection/test_dataset.rb', line 23

# Builds the test dataset from the configured test corpus.
#
# Reads the configured edits CSV, computes one ARFF file per configured
# feature (skipping features whose ARFF already exists on disk), then merges
# the per-feature ARFFs into a single dataset. The dataset is normalized
# when a LibSVM classifier is configured (one-class classification needs
# normalized attributes).
#
# @raise [EditsFileNotConfiguredError] if no test corpus edits file is set
# @return [Object] the merged (and possibly normalized) Weka dataset
def self.build
  @config = Wikipedia::VandalismDetection.configuration
  print "\ncreating test dataset..."

  edits_file = @config.test_corpus_edits_file
  raise EditsFileNotConfiguredError unless edits_file

  edits = CSV.parse(File.read(edits_file), headers: true)

  output_directory = File.join(@config.output_base_directory, 'test')
  # mkdir_p is idempotent and creates the base directory as parent,
  # so no exists? guard and no separate base-directory call is needed.
  FileUtils.mkdir_p(output_directory)

  # Map feature name => open IO handle, but only for features whose
  # ARFF file does not exist yet (existing files are treated as done).
  feature_files = @config.features.each_with_object({}) do |feature_name, hash|
    file_name = "#{feature_name.gsub(' ', '_').downcase}.arff"
    arff_file = File.join(output_directory, file_name)

    unless File.exist?(arff_file)
      dataset = Instances.empty_for_test_feature(feature_name)
      dataset.to_ARFF(arff_file)
      hash[feature_name] = File.open(arff_file, 'a')
    end
  end

  feature_calculator = FeatureCalculator.new

  unless feature_files.empty?
    processed_edits = 0
    edits_count = edits.count

    edits.each do |edit_data|
      old_revision_id = edit_data['oldrevisionid']
      new_revision_id = edit_data['newrevisionid']

      processed_edits += 1
      print_progress(processed_edits, edits_count, "computing test features")

      # Only edits whose both revisions appear in the gold annotation
      # are part of the test set.
      next unless (annotated_revision?(old_revision_id) && annotated_revision?(new_revision_id))
      edit = create_edit_from(edit_data)

      feature_files.each do |feature_name, file|
        value = feature_calculator.calculate_feature_for(edit, feature_name)
        file.puts [value, old_revision_id, new_revision_id].join(',')
      end
    end

    # close all io objects
    feature_files.each do |feature_name, file|
      file.close
      puts "\n'#{File.basename(file.path)}' saved to #{File.dirname(file.path)}"
    end
  end

  dataset = merge_feature_arffs(@config.features, output_directory)
  dataset = normalize(dataset) if @config.classifier_type.match('Functions::LibSVM')
  dataset
end

.build!Object

Saves and returns the dataset as ARFF file. As test data the configured data corpus from /config/config.yml is used.



90
91
92
93
94
95
96
97
98
99
# File 'lib/wikipedia/vandalism_detection/test_dataset.rb', line 90

# Builds the test dataset (see .build) and additionally writes it to the
# configured test output ARFF file.
#
# @return [Object] the dataset that was saved
def self.build!
  @config = Wikipedia::VandalismDetection.configuration

  dataset = instances
  arff_path = @config.test_output_arff_file
  dataset.to_ARFF(arff_path)
  puts "\n'#{File.basename(arff_path)}' saved to #{File.dirname(arff_path)}"

  dataset
end

.create_corpus_file_index!Object

Saves and returns a file index hash of structure [file_name => full_path] for the given directory.



191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/wikipedia/vandalism_detection/test_dataset.rb', line 191

# Builds a { file_name => full_path } index over the test corpus revision
# files and saves it as YAML to the configured index file.
#
# Only regular files named exactly "<digits>.txt" that belong to an
# annotated revision are indexed.
#
# @raise [RevisionsDirectoryNotConfiguredError] if no revisions directory is set
# @return [Hash] the created file index
def self.create_corpus_file_index!
  @config = Wikipedia::VandalismDetection.configuration
  revisions_directory = @config.test_corpus_revisions_directory

  raise RevisionsDirectoryNotConfiguredError unless revisions_directory

  print "\nCreating test corpus index file..."
  file_index = {}

  Dir.open revisions_directory do |part_directories|
    part_directories.each do |part_directory|
      Dir.open "#{revisions_directory}/#{part_directory}" do |contents|
        contents.each do |file|
          path = "#{revisions_directory}/#{part_directory}/#{file}"

          # Anchored \A..\z with an escaped dot: the previous /\d+.txt/
          # also matched names like "123Xtxt.bak" (unescaped '.', no anchors).
          if File.file?(path) && (file =~ /\A\d+\.txt\z/) && annotated_revision?(file)
            file_index[file] = path
            print "\r processed #{file_index.count } files"
          end
        end
      end
    end
  end

  file = @config.test_output_index_file
  dirname = File.dirname(file)
  # mkdir_p: the plain mkdir raised when intermediate directories were missing.
  FileUtils.mkdir_p(dirname)

  written = File.open(file, 'w') { |f| f.write(file_index.to_yaml) }
  print "\nSaved test corpus index file to #{file}.\n" if written > 0

  file_index
end

.edit(old_revision_id, new_revision_id) ⇒ Object

Returns the Edit with the given revision ids. Test corpus is searched for the revisions’ data.



278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
# File 'lib/wikipedia/vandalism_detection/test_dataset.rb', line 278

# Looks up the edit with the given revision ids in the test corpus edits
# file and returns it as an Edit, or nil when it is not found or not
# part of the gold annotation.
#
# @raise [EditsFileNotConfiguredError] if no test corpus edits file is set
def self.edit(old_revision_id, new_revision_id)
  @config = Wikipedia::VandalismDetection.configuration
  edits_file = @config.test_corpus_edits_file
  raise EditsFileNotConfiguredError unless edits_file

  # Parse the CSV once and memoize it for subsequent lookups.
  @edits_csv ||= CSV.parse(File.read(edits_file), headers: true)

  row = @edits_csv.find do |candidate|
    candidate['oldrevisionid'] == old_revision_id &&
      candidate['newrevisionid'] == new_revision_id
  end

  both_annotated = annotated_revision?(old_revision_id) &&
    annotated_revision?(new_revision_id)

  create_edit_from(row) if both_annotated && row
end

.ground_truth_hash(ground_truth_file) ⇒ Object

Returns a hash for classification data from given ground truth file



167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# File 'lib/wikipedia/vandalism_detection/test_dataset.rb', line 167

# Parses a ground truth file into a classification hash.
#
# Each line is expected to hold "<old_id> <new_id> <class>" separated by
# whitespace. The result maps the symbol :"<old_id>-<new_id>" to a hash
# with :old_revision_id, :new_revision_id and :class keys.
#
# @param ground_truth_file [String] path of the ground truth file
# @return [Hash] classification data keyed by revision-id pair
def self.ground_truth_hash(ground_truth_file)
  File.readlines(ground_truth_file).each_with_object({}) do |line, truth|
    old_part, new_part, class_short = line.split(' ')

    old_revision_id = old_part.to_i
    new_revision_id = new_part.to_i

    truth[:"#{old_revision_id}-#{new_revision_id}"] = {
        old_revision_id: old_revision_id,
        new_revision_id: new_revision_id,
        class: class_short
    }
  end
end

.normalize(dataset) ⇒ Object

Returns the normalized dataset (important for lib svm one class classification)



260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# File 'lib/wikipedia/vandalism_detection/test_dataset.rb', line 260

# Normalizes the numeric feature attributes of the dataset (required for
# LibSVM one-class classification) and merges them back with the
# remaining, non-numeric attributes.
#
# Relies on @config being set by a prior .build/.build! call.
def self.normalize(dataset)
  feature_range = "1-#{@config.features.count}"

  splitter = Weka::Filters::Unsupervised::Attribute::Remove.new
  splitter.data(dataset)

  # -V inverts the removal, i.e. keep only the feature columns.
  splitter.filter_options("-V -R #{feature_range}")
  numeric_part = splitter.use

  # Without -V the feature columns are dropped, leaving the rest.
  splitter.filter_options("-R #{feature_range}")
  remainder_part = splitter.use

  scaler = Weka::Filters::Unsupervised::Attribute::Normalize.new
  scaler.data(numeric_part)
  scaled_part = scaler.use

  scaled_part.merge_with remainder_part
end