Class: Selector::InformationGain

Inherits:
BiNormalSeperation show all
Includes:
IG
Defined in:
lib/svm_helper/selectors/information_gain.rb

Overview

Feature Selection for Text Classification - HP Labs http://www.google.com/patents/US20040059697

Constant Summary

Constants included from BNS

BNS::A, BNS::B, BNS::C, BNS::D, BNS::P_HIGH, BNS::P_LOW, BNS::SQR2, BNS::SQR2PI

Constants inherited from Simple

Simple::DEFAULT_DICTIONARY_SIZE

Constants included from ParallelHelper

ParallelHelper::THREAD_COUNT

Instance Attribute Summary

Attributes inherited from Simple

#classification_encoding, #global_dictionary, #gram_size, #word_selection

Instance Method Summary collapse

Methods included from IG

#e, #information_gain, #xlx

Methods inherited from BiNormalSeperation

#build_dictionary, #extract_words, #generate_vectors, #initialize

Methods included from BNS

#bi_normal_seperation, #cdf, #cdf_inverse

Methods inherited from Simple

#build_dictionary, #extract_words, #extract_words_from_data, #generate_vector, #generate_vectors, #initialize, #reset

Methods included from ParallelHelper

#p_map, #p_map_with_index, #parallel?

Constructor Details

This class inherits a constructor from Selector::BiNormalSeperation

Instance Method Details

#generate_global_dictionary(all_words, size = DEFAULT_DICTIONARY_SIZE) ⇒ Array<String>

Generates the list of words used as the global dictionary, keeping the `size` words with the highest information gain.

Parameters:

  • size (defaults to: DEFAULT_DICTIONARY_SIZE)

    dictionary size

Returns:

  • (Array<String>)

    list of words



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/svm_helper/selectors/information_gain.rb', line 20

# Builds the global dictionary from the highest-scoring words and stores it
# in @global_dictionary. Does nothing (returns nil) when the dictionary has
# already been populated.
#
# A word is only scored when it occurs in bags of both classes; each word is
# counted at most once per bag.
#
# @param all_words [Enumerable] bags responding to #label (truthy = positive
#   class) and #features (list of words)
# @param size [Integer] maximum number of words kept in the dictionary
# @return [Array<String>, nil] selected words ordered by ascending score, or
#   nil when the dictionary was already built
def generate_global_dictionary(all_words, size = DEFAULT_DICTIONARY_SIZE)
  return unless global_dictionary.empty?

  # Both tallies are indexed by label: slot 0 = negative, slot 1 = positive.
  label_counts = [0, 0]
  features = Hash.new { |hash, word| hash[word] = [0, 0] }
  all_words.each do |bag|
    label = bag.label ? 1 : 0
    label_counts[label] += 1
    # only count a feature once per bag
    bag.features.uniq.each { |word| features[word][label] += 1 }
  end
  neg, pos = label_counts

  words = p_map(features) do |word, counts|
    next if counts.any?(&:zero?) # skip words only appearing in one class

    # counts is [negative bags, positive bags] (same layout as label_counts),
    # so the positive-class hits (tp) live in the second slot.
    fp, tp = counts
    [word, information_gain(pos, neg, tp, fp).abs]
  end

  # Skipped words are nil; keep the `size` best-scoring of the rest.
  @global_dictionary = words.compact
                            .sort_by(&:last)
                            .last(size)
                            .map(&:first)
end

#label ⇒ Object



10
11
12
# File 'lib/svm_helper/selectors/information_gain.rb', line 10

# Human-readable name of this selector.
#
# @return [String] always "InformationGain"
def label
  'InformationGain'
end