Class: Wikipedia::VandalismDetection::Configuration

Inherits:
Object
  • Object
show all
Includes:
Singleton
Defined in:
lib/wikipedia/vandalism_detection/configuration.rb

Constant Summary collapse

TRAINING_DATA_BALANCED =
'balanced'
TRAINING_DATA_UNBALANCED =
'unbalanced'
TRAINING_DATA_OVERSAMPLED =
'oversampled'

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Configuration

Returns a new instance of Configuration.



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 28

# Builds the configuration data and caches the most frequently used settings
# in instance variables.
#
# The defaults come from DefaultConfiguration::DEFAULTS. When
# load_config_file returns a truthy value for the defaults' config_file, that
# value is deep-merged over the defaults; otherwise the defaults are used as
# they are.
def initialize
  defaults = DefaultConfiguration[DefaultConfiguration::DEFAULTS]
  @config_from_file ||= defaults.load_config_file(defaults.config_file)

  @data ||= if @config_from_file
              defaults.deep_merge(@config_from_file)
            else
              defaults
            end

  # Classifier related settings.
  classifier = @data['classifier']
  @classifier_type = classifier['type']
  @classifier_options = classifier['options']
  @cross_validation_fold = classifier['cross-validation-fold']
  @training_data_options = classifier['training-data-options']
  # Stored as a String so the value can be matched against a pattern later.
  @replace_missing_values = classifier['replace-missing-values'].to_s

  @features = @data['features']

  # Output related settings; the base directory is expanded relative to the
  # path of this source file.
  output = @data['output']
  @output_base_directory = File.expand_path(output['base_directory'], __FILE__)
  @training_arff_file_name = output['training']['arff_file']
  @test_arff_file_name = output['test']['arff_file']
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(method_name, *args) ⇒ Object

Returns the path string for corpora files/directories and output files, following the schema: <corpus type>_<progress stage>_<file name>.

Instead of ‘corpora’ the word ‘corpus’ is used for grammatical reasons.

example:

training_corpus_edits_file()
test_output_index_file()


137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 137

# Resolves two kinds of dynamic readers:
#
# 1. Any instance variable that is already set is returned under its own
#    name, e.g. +training_arff_file_name+ returns @training_arff_file_name.
# 2. Path readers following the schema
#    <corpus type>_<progress stage>_<file name>, where the progress stage is
#    'corpus' or 'output', e.g. +training_corpus_edits_file+ or
#    +test_output_index_file+. The resolved absolute path is memoized in an
#    instance variable named after the method.
#
# Everything else is passed to +super+ and therefore raises NoMethodError.
# This includes predicate/setter style names (which are not valid instance
# variable names) and path readers without a matching configuration entry.
#
# @param method_name [Symbol] name of the called method
# @param args [Array] arguments of the call (not used for the lookup)
# @return [Object] the instance variable value or the absolute path String
# @raise [NoMethodError] if the name cannot be resolved
def method_missing(method_name, *args)
  ivar = "@#{method_name}".to_sym
  # instance_variables only lists valid names, so this check cannot raise
  # NameError the way instance_variable_defined?("@foo?") does.
  return instance_variable_get(ivar) if instance_variables.include?(ivar)

  path = dynamic_file_path(method_name)
  return super if path.nil?

  instance_variable_set(ivar, path)
end

# Keeps respond_to? in line with method_missing.
#
# @param method_name [Symbol] name to check
# @param include_private [Boolean] forwarded to +super+
# @return [Boolean]
def respond_to_missing?(method_name, include_private = false)
  instance_variables.include?("@#{method_name}".to_sym) ||
    !dynamic_file_path(method_name).nil? ||
    super
end

# Computes the absolute path for a <corpus type>_<progress stage>_<file name>
# reader, or returns nil when the name does not follow the schema or the
# configuration data holds no matching entry.
def dynamic_file_path(method_name)
  name = method_name.to_s
  return nil unless name =~ /\A[A-Za-z_]\w*\z/ && @data.is_a?(Hash)

  parts = name.split('_')
  return nil if parts.count < 4

  corpus_type = parts[0]
  file_key = parts[2..-1].join('_')

  # 'corpus' is used in method names for grammatical reasons; the
  # configuration key is 'corpora'.
  stage_key =
    case parts[1]
    when 'corpus' then 'corpora'
    when 'output' then 'output'
    end
  return nil if stage_key.nil?

  section = @data[stage_key]
  entry = section[corpus_type] if section.is_a?(Hash)
  return nil unless entry.is_a?(Hash) && entry[file_key]

  base =
    if stage_key == 'corpora'
      if section['base_directory'] && entry['base_directory']
        File.join(section['base_directory'], entry['base_directory'])
      end
    else
      @output_base_directory
    end
  return nil if base.nil?

  File.expand_path(File.join(base, entry[file_key]), __FILE__)
end
private :dynamic_file_path

Instance Attribute Details

#classifier_options ⇒ Object (readonly)

Returns the value of attribute classifier_options.



20
21
22
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 20

# Reader for @classifier_options, which initialize takes from
# @data['classifier']['options'].
def classifier_options
  @classifier_options
end

#classifier_type ⇒ Object (readonly)

Returns the value of attribute classifier_type.



20
21
22
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 20

# Reader for @classifier_type, which initialize takes from
# @data['classifier']['type'].
def classifier_type
  @classifier_type
end

#cross_validation_fold ⇒ Object (readonly)

Returns the value of attribute cross_validation_fold.



20
21
22
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 20

# Reader for @cross_validation_fold, which initialize takes from
# @data['classifier']['cross-validation-fold'].
def cross_validation_fold
  @cross_validation_fold
end

#data ⇒ Object (readonly)

Returns the value of attribute data.



20
21
22
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 20

# Reader for @data: the default configuration, deep-merged with the values
# loaded from the config file when initialize found any.
def data
  @data
end

#features ⇒ Object (readonly)

Returns the value of attribute features.



20
21
22
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 20

# Reader for @features, which initialize takes from @data['features'].
def features
  @features
end

#output_base_directory ⇒ Object (readonly)

Returns the value of attribute output_base_directory.



20
21
22
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 20

# Reader for @output_base_directory: @data['output']['base_directory'] as
# expanded by initialize via File.expand_path relative to __FILE__.
def output_base_directory
  @output_base_directory
end

#training_data_options ⇒ Object (readonly)

Returns the value of attribute training_data_options.



20
21
22
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 20

# Reader for @training_data_options, which initialize takes from
# @data['classifier']['training-data-options'].
def training_data_options
  @training_data_options
end

Instance Method Details

#balanced_training_data? ⇒ Boolean

Returns a boolean value whether a balanced data set is used for classifier training. (balanced means: same number of vandalism and regular samples)

Returns:

  • (Boolean)


57
58
59
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 57

# Tells whether the configured training data options are exactly
# TRAINING_DATA_BALANCED.
#
# @return [Boolean]
def balanced_training_data?
  selected = @training_data_options
  selected == TRAINING_DATA_BALANCED
end

#oversampled_training_data? ⇒ Boolean

Returns a boolean value indicating whether an oversampled data set is used for classifier training. (oversampled means: a balanced dataset is enriched with vandalism instances if the number of vandalism samples is less than the number of regular samples)

Returns:

  • (Boolean)


71
72
73
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 71

# Tells whether the configured training data options mention
# TRAINING_DATA_OVERSAMPLED. Unset (nil) options count as not oversampled.
#
# @return [Boolean]
def oversampled_training_data?
  options = @training_data_options
  return false if options.nil?

  options.include?(TRAINING_DATA_OVERSAMPLED)
end

#oversampling_options ⇒ Object

Returns a hash of the oversampled training data options. Allowed options are -p (-percentage) and -u (-undersampling).



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 77

# Parses the oversampling parameters out of the training data options.
#
# Recognized parameters (each introduced by a dash):
#   -p <number> / -percentage <number>  oversampling percentage
#   -u <flag> [<number>] / -undersampling <flag> [<number>]
#
# For undersampling, a flag matching true/t/yes/y enables it with the given
# number, or with the default of 100.0 when no number follows the flag. Any
# other flag yields 0.0. Parameters that are absent default to 100.0.
#
# @return [Hash] { percentage: Float, undersampling: Float } when the
#   training data is oversampled, otherwise an empty Hash
def oversampling_options
  return {} unless oversampled_training_data?

  percent_default = 100.0
  undersampling_default = 100.0

  # 'oversampled -p 200 -u true 50' => [' ', 'p 200 ', 'u true 50']
  params = @training_data_options.gsub(TRAINING_DATA_OVERSAMPLED, '').split('-')

  percent_option = params.find { |param| param =~ /(p\s|percentage\s)\d+/i }
  undersampling_option = params.find { |param| param =~ /(u\s|undersampling\s)/i }

  percent = percent_option ? percent_option.split.last.to_f : percent_default

  undersampling =
    if undersampling_option.nil?
      undersampling_default
    elsif undersampling_option =~ /(true|t|yes|y)/i
      # The last token is only a percentage if it is numeric. For a bare
      # '-u true' it is the flag itself, and "true".to_f would be 0.0.
      last_token = undersampling_option.split.last
      last_token =~ /\A\.?\d/ ? last_token.to_f : undersampling_default
    else
      0.0
    end

  { percentage: percent, undersampling: undersampling }
end

#replace_training_data_missing_values? ⇒ Boolean

Returns:

  • (Boolean)


51
52
53
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 51

# Tells whether missing values in the training data should be replaced.
#
# The stored setting counts as enabled when, ignoring case and surrounding
# whitespace, it is exactly one of: true, t, yes, y. The pattern is anchored
# so that unrelated values merely containing a "t" or "y" (e.g. "not") are
# not mistaken for an enabled flag.
#
# @return [Boolean]
def replace_training_data_missing_values?
  setting = @replace_missing_values.to_s.strip
  !!(setting =~ /\A(true|t|yes|y)\z/i)
end

#test_output_arff_file ⇒ Object

Returns the path of the test arff file. The path includes the used classifier and training data options, and lies in the same directory as the classification file.



124
125
126
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 124

# Path of the test arff file: the configured test arff file name placed in
# the directory that holds the test classification file.
#
# @return [String]
def test_output_arff_file
  classification_directory = File.dirname(test_output_classification_file)
  File.join(classification_directory, @test_arff_file_name)
end

#test_output_classification_file ⇒ Object

Returns the path to the classification file. Subdirectories for the classifier and the training data options are added automatically, so the result is <output base dir>/<classifier name>/<training data options>/<file name>



108
109
110
111
112
113
114
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 108

# Builds the path to the test classification file:
# <output base dir>/<classifier name>/<training data options>/<file name>
#
# The classifier name is the last segment of the classifier type,
# downcased. Whitespace runs in the training data options are replaced by
# underscores. Unset (nil) training data options fall back to
# TRAINING_DATA_UNBALANCED, matching unbalanced_training_data?, instead of
# failing on nil.
#
# @return [String]
def test_output_classification_file
  classification_file_name = @data['output']['test']['classification_file']
  classifier_name = @classifier_type.split('::').last.downcase
  options_directory = (@training_data_options || TRAINING_DATA_UNBALANCED).to_s.gsub(/\s+/, '_')

  File.join(@output_base_directory, classifier_name, options_directory, classification_file_name)
end

#training_output_arff_file ⇒ Object

Returns the path of the training arff file. The path includes the used classifier and training data options, and lies in the same directory as the classification file.



118
119
120
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 118

# Path of the training arff file: the configured training arff file name
# placed in the directory that holds the test classification file.
#
# @return [String]
def training_output_arff_file
  classification_directory = File.dirname(test_output_classification_file)
  File.join(classification_directory, @training_arff_file_name)
end

#unbalanced_training_data? ⇒ Boolean

Returns a boolean value whether an unbalanced data set is used for classifier training. (unbalanced means: vandalism and regular samples are used as given in arff file)

Returns:

  • (Boolean)


63
64
65
66
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 63

# Tells whether the training data is used as given, i.e. unbalanced. This is
# the case when the options are unset, equal TRAINING_DATA_UNBALANCED, or
# select neither the balanced nor the oversampled mode.
#
# @return [Boolean]
def unbalanced_training_data?
  options = @training_data_options
  return true if options.nil? || options == TRAINING_DATA_UNBALANCED

  !(balanced_training_data? || oversampled_training_data?)
end

#use_occ? ⇒ Boolean

Returns whether the classifier uses one class classification

Returns:

  • (Boolean)


47
48
49
# File 'lib/wikipedia/vandalism_detection/configuration.rb', line 47

# Tells whether the configured classifier type equals the type reported by
# Weka::Classifiers::Meta::OneClassClassifier, i.e. whether one class
# classification is used.
#
# @return [Boolean]
def use_occ?
  configured_type = @classifier_type
  one_class_type = Weka::Classifiers::Meta::OneClassClassifier.type
  configured_type == one_class_type
end