Class: Selector::BiNormalSeperation

# File 'lib/svm_helper/selectors/bi_normal_seperation.rb', line 14

# Sets up the selector and remembers which word selection strategy to use.
#
# @param classification forwarded unchanged to the parent initializer
# @param args [Hash] options, also forwarded to the parent initializer
# @option args [Symbol] :word_selection (:grams1_2) word selection strategy
def initialize(classification, args = {})
  # bare super forwards both parameters exactly as they were received
  super
  @word_selection = args.fetch(:word_selection, :grams1_2)
end

Instance Method Details

#build_dictionary(data_set, dictionary_size = DEFAULT_DICTIONARY_SIZE) ⇒ `Object`

# File 'lib/svm_helper/selectors/bi_normal_seperation.rb', line 70

# Extracts the labelled words of every entry in +data_set+ and hands them
# to #generate_global_dictionary.
#
# @param data_set list of data entries passed on to #extract_words
# @param dictionary_size maximum dictionary size
# @return whatever #generate_global_dictionary returns
def build_dictionary(data_set, dictionary_size = DEFAULT_DICTIONARY_SIZE)
  # labels are kept (second argument) because the dictionary needs them
  labelled_words = extract_words(data_set, true)
  generate_global_dictionary(labelled_words, dictionary_size)
end

#extract_words(data_set, keep_label = false) ⇒ `Array<OpenStruct<Array<String>,Boolean>>`

extracts the words of all provided data entries

Parameters:

data_set (Array<PreprocessedData>) —
list of preprocessed data
keep_label (defaults to: false)

Returns:

(Array<OpenStruct<Array<String>,Boolean>>) —
list of words per data entry

# File 'lib/svm_helper/selectors/bi_normal_seperation.rb', line 80

# Maps every entry of +data_set+ through #extract_words_from_data.
#
# @param data_set [Array] list of data entries
# @param keep_label [Boolean] passed through to #extract_words_from_data
# @return [Array] one extraction result per data entry, in input order
def extract_words(data_set, keep_label = false)
  data_set.map { |entry| extract_words_from_data(entry, keep_label) }
end

#generate_global_dictionary(all_words, size = DEFAULT_DICTIONARY_SIZE) ⇒ `Array<String>`

generates a list of words used as dictionary

Parameters:

size (defaults to: DEFAULT_DICTIONARY_SIZE) —
dictionary size

Returns:

(Array<String>) —
list of words

# File 'lib/svm_helper/selectors/bi_normal_seperation.rb', line 42

# Builds the global dictionary from labelled bags of words, keeping the
# +size+ words with the highest absolute bi-normal separation score.
#
# A dictionary that is already populated is left untouched and returned
# as is, so callers always receive the dictionary rather than nil.
#
# @param all_words [Array<#features, #label>] one bag per data entry;
#   +features+ is the list of words, +label+ is treated as a boolean
# @param size [Integer] maximum number of words kept in the dictionary
# @return [Array<String>] the dictionary, ordered by ascending score
def generate_global_dictionary(all_words, size = DEFAULT_DICTIONARY_SIZE)
  return global_dictionary unless global_dictionary.empty?

  # index 0 counts bags with a falsy label, index 1 bags with a truthy label
  label_counts = [0, 0]
  # word => [number of falsy-label bags, number of truthy-label bags] containing it;
  # the default block creates the counter pair on first access
  features = all_words.each_with_object(Hash.new { |h, k| h[k] = [0, 0] }) do |bag, counts_by_word|
    label = bag.label ? 1 : 0
    label_counts[label] += 1
    # only count a feature once per bag
    bag.features.uniq.each { |word| counts_by_word[word][label] += 1 }
  end

  neg, pos = label_counts
  words = p_map(features) do |word, counts|
    next if counts.include?(0) # skip words only appearing in one class

    # NOTE(review): totals are passed as (pos, neg) while counts is
    # [neg count, pos count] -- confirm against the parameter order of
    # BNS#bi_normal_seperation.
    bns = bi_normal_seperation(pos, neg, *counts)
    [word, bns.abs]
  end

  # skipped words come back as nil; keep the +size+ highest scoring words
  @global_dictionary = words.compact
                            .sort_by(&:last)
                            .last(size)
                            .map(&:first)
end

#generate_vectors(data_set, dictionary_size = DEFAULT_DICTIONARY_SIZE) ⇒ `Array<FeatureVector>`

generates a list of feature vectors and their labels from preprocessed data

Parameters:

data_set (Array<PreprocessedData>) —
list of preprocessed data
classification (Symbol) —
in :industry, :function, :career_level
dictionary_size (Integer) (defaults to: DEFAULT_DICTIONARY_SIZE) —
Size of the dictionary to create if none exists

Returns:

(Array<FeatureVector>) —
list of feature vectors and labels

# File 'lib/svm_helper/selectors/bi_normal_seperation.rb', line 25

# Turns every entry of +data_set+ into a vector via #make_vector.
#
# The labelled words are extracted first and passed to
# #generate_global_dictionary; afterwards each entry's distinct words are
# combined with the original entry to build its vector.
#
# @param data_set [Array] list of data entries
# @param dictionary_size maximum dictionary size handed to
#   #generate_global_dictionary
# @return the result of #p_map_with_index over the per-entry word lists
def generate_vectors(data_set, dictionary_size = DEFAULT_DICTIONARY_SIZE)
  labelled_bags = extract_words(data_set, true)
  generate_global_dictionary(labelled_bags, dictionary_size)

  feature_lists = labelled_bags.map { |bag| bag.features }
  p_map_with_index(feature_lists) do |features, position|
    # duplicates are dropped; the index ties the words back to their entry
    make_vector(features.uniq, data_set[position])
  end
end

#label ⇒ `Object`



10
11
12

# File 'lib/svm_helper/selectors/bi_normal_seperation.rb', line 10

# Name of this selector.
#
# @return [String] the fixed string "BiNormalSeperation"
def label
  'BiNormalSeperation'
end

Class: Selector::BiNormalSeperation

Overview

Direct Known Subclasses

Constant Summary

Constants included from BNS

Constants inherited from Simple

Constants included from ParallelHelper

Instance Attribute Summary

Attributes inherited from Simple

Instance Method Summary collapse

Methods included from BNS

Methods inherited from Simple

Methods included from ParallelHelper

Constructor Details

#initialize(classification, args = {}) ⇒ BiNormalSeperation

Instance Method Details

#build_dictionary(data_set, dictionary_size = DEFAULT_DICTIONARY_SIZE) ⇒ Object

#extract_words(data_set, keep_label = false) ⇒ Array<OpenStruct<Array<String>,Boolean>>

#generate_global_dictionary(all_words, size = DEFAULT_DICTIONARY_SIZE) ⇒ Array<String>

#generate_vectors(data_set, dictionary_size = DEFAULT_DICTIONARY_SIZE) ⇒ Array<FeatureVector>

#label ⇒ Object

#initialize(classification, args = {}) ⇒ `BiNormalSeperation`

#build_dictionary(data_set, dictionary_size = DEFAULT_DICTIONARY_SIZE) ⇒ `Object`

#extract_words(data_set, keep_label = false) ⇒ `Array<OpenStruct<Array<String>,Boolean>>`

#generate_global_dictionary(all_words, size = DEFAULT_DICTIONARY_SIZE) ⇒ `Array<String>`

#generate_vectors(data_set, dictionary_size = DEFAULT_DICTIONARY_SIZE) ⇒ `Array<FeatureVector>`

#label ⇒ `Object`