Class: Wikipedia::VandalismDetection::TrainingDataset

Inherits: Object
Defined in: lib/wikipedia/vandalism_detection/training_dataset.rb

Overview

This class provides methods for creating and retrieving a training ARFF file from a configured training corpus.
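A minimal usage sketch, assuming the corpus paths (annotations file, revisions directory) have already been set up via Wikipedia::VandalismDetection.configuration:

require 'wikipedia/vandalism_detection'

dataset = Wikipedia::VandalismDetection::TrainingDataset.build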

Class Method Summary

.balanced_instances ⇒ Object
.build ⇒ Object (also: instances)
.create_corpus_file_index! ⇒ Object
.oversampled_instances(options = {}) ⇒ Object

Class Method Details

.balanced_instances ⇒ Object

Returns the balanced training dataset (same number of vandalism and regular instances).



# File 'lib/wikipedia/vandalism_detection/training_dataset.rb', line 91

def self.balanced_instances
  dataset = self.build
  filter = Weka::Filters::Supervised::Instance::SpreadSubsample.new

  # uniform distribution (remove majority instances)
  filter.set do
    data dataset
    filter_options '-M 1'
  end

  filter.use
end
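SpreadSubsample's -M option caps the spread between the rarest and the most common class; -M 1 enforces a uniform (1:1) distribution by dropping surplus majority-class instances. Usage sketch:

balanced = Wikipedia::VandalismDetection::TrainingDataset.balanced_instances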

.build ⇒ Object
Also known as: instances

Returns an instances dataset built from the configured gold annotation file, computing the configured features with a FeatureCalculator.



# File 'lib/wikipedia/vandalism_detection/training_dataset.rb', line 24

def self.build
  @config = Wikipedia::VandalismDetection.configuration

  print "\ncreating training dataset..."

  annotations_file = @config.training_corpus_annotations_file
  raise AnnotationsFileNotConfiguredError unless annotations_file

  annotations = CSV.parse(File.read(annotations_file), headers: true)
  annotation_data = annotations.map { |row| { edit_id: row['editid'], class: row['class'] } }

  output_directory = File.join(@config.output_base_directory, 'training')
  # mkdir_p is idempotent and creates missing parent directories,
  # including the output base directory
  FileUtils.mkdir_p(output_directory)

  # create feature file hash with io objects
  feature_files = @config.features.inject({}) do |hash, feature_name|
    file_name = "#{feature_name.gsub(' ', '_').downcase}.arff"
    arff_file = File.join(output_directory, file_name)

    unless File.exist?(arff_file)
      dataset = Instances.empty_for_feature(feature_name)
      dataset.to_ARFF(arff_file)
      hash[feature_name] = File.open(arff_file, 'a')
    end

    hash
  end

  feature_calculator = FeatureCalculator.new

  unless feature_files.empty?
    processed_edits = 0

    annotation_data.each do |row|
      edit_id = row[:edit_id]
      vandalism = row[:class]
      edit = create_edit_from(edit_id)

      feature_files.each do |feature_name, file|
        value = feature_calculator.calculate_feature_for(edit, feature_name)
        file.puts [value, vandalism].join(',')
      end

      processed_edits += 1
      print_progress(processed_edits, annotation_data.count, "computing training features")
    end

    # close all io objects
    feature_files.each do |feature_name, file|
      file.close
      puts "'#{File.basename(file.path)}' saved to #{File.dirname(file.path)}"
    end
  end

  dataset = merge_feature_arffs(@config.features, output_directory)
  dataset.class_index = @config.features.count
  dataset = replace_missing_values(dataset) if @config.replace_training_data_missing_values?

  dataset
end
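Note that the per-feature ARFF files are cached: a feature is only recomputed when its ARFF file is missing from the training output directory. A sketch for forcing a single feature to be rebuilt ('anonymity' is a hypothetical feature name; the file-name scheme follows the code above):

config = Wikipedia::VandalismDetection.configuration
arff   = File.join(config.output_base_directory, 'training', 'anonymity.arff')

File.delete(arff) if File.exist?(arff)
dataset = Wikipedia::VandalismDetection::TrainingDataset.build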

.create_corpus_file_index! ⇒ Object

Saves and returns a file index hash of structure { file_name => full_path } for the configured revisions directory.



# File 'lib/wikipedia/vandalism_detection/training_dataset.rb', line 154

def self.create_corpus_file_index!
  @config = Wikipedia::VandalismDetection.configuration
  revisions_directory = @config.training_corpus_revisions_directory

  raise RevisionsDirectoryNotConfiguredError unless revisions_directory

  print "\ncreating file index..."
  file_index = {}

  Dir.open revisions_directory do |part_directories|
    part_directories.each do |part_directory|
      # skip '.' and '..' entries returned by Dir
      next if part_directory.start_with?('.')

      Dir.open "#{revisions_directory}/#{part_directory}" do |contents|
        contents.each do |file|
          path = "#{revisions_directory}/#{part_directory}/#{file}"

          if File.file?(path) && (file =~ /\d+\.txt/)
            file_index[file] = path
            print "\r processed #{file_index.count} files"
          end
        end
      end
    end
  end

  file = @config.training_output_index_file
  dirname = File.dirname(file)
  FileUtils.mkdir_p(dirname)

  written = File.open(file, 'w') { |f| f.write(file_index.to_yaml) }
  print "Index file saved to #{file}.\n" if written > 0

  file_index
end
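Usage sketch; the index is persisted as YAML to the configured training output index file, so it can be reloaded later without rescanning the corpus (the entry shown is illustrative):

index = Wikipedia::VandalismDetection::TrainingDataset.create_corpus_file_index!
index['326471754.txt'] # => "<revisions_directory>/part-1/326471754.txt"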

.oversampled_instances(options = {}) ⇒ Object

Returns an oversampled training dataset. Oversampling options can be passed, e.g.:

percentage: 200
undersampling: false

For oversampling, the Weka SMOTE package is used. For the SMOTE method, see the paper arxiv.org/pdf/1106.1813.pdf; documentation: weka.sourceforge.net/doc.packages/SMOTE/weka/filters/supervised/instance/SMOTE.html
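A usage sketch (values are illustrative). The undersampling value is a percentage that is divided by 100 and passed as SpreadSubsample's -M spread, so undersampling: 100 yields -M 1.0, i.e. a uniform class distribution after oversampling, while undersampling: 200 lets the majority class stay at most twice the size of the minority class:

dataset = Wikipedia::VandalismDetection::TrainingDataset.oversampled_instances(
  percentage: 200,    # SMOTE: create 200% additional synthetic minority instances
  undersampling: 100  # then subsample the majority class down to a 1:1 ratio
)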



# File 'lib/wikipedia/vandalism_detection/training_dataset.rb', line 112

def self.oversampled_instances(options = {})
  config = Wikipedia::VandalismDetection.configuration
  default_options = config.oversampling_options
  options[:percentage]    ||= default_options[:percentage]
  options[:undersampling] ||= default_options[:undersampling]

  smote = Weka::Filters::Supervised::Instance::SMOTE.new
  dataset = self.build
  percentage = options[:percentage]
  smote_options = "-P #{percentage.to_i}" if percentage

  smote.set do
    data dataset
    filter_options smote_options if smote_options
  end

  # :undersampling may be false (or nil) to skip subsampling entirely
  undersampling = options[:undersampling] ? options[:undersampling] / 100.0 : 0.0
  smote_dataset = smote.use

  if undersampling > 0.0
    subsample = Weka::Filters::Supervised::Instance::SpreadSubsample.new

    # balance (remove majority instances)
    subsample.set do
      data smote_dataset
      filter_options "-M #{undersampling}"
    end

    subsample.use
  else
    smote_dataset
  end
end