Class: Selector::Simple

Inherits:
Object
  • Object
show all
Includes:
ParallelHelper
Defined in:
lib/svm_helper/selectors/simple.rb

Overview

Selector which uses a simple dictionary to generate feature vectors

Author:

  • Andreas Eger

Direct Known Subclasses

BiNormalSeperation

Constant Summary collapse

DEFAULT_DICTIONARY_SIZE =

default dictionary size

800

Constants included from ParallelHelper

ParallelHelper::THREAD_COUNT

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from ParallelHelper

#p_map, #p_map_with_index, #parallel?

Constructor Details

#initialize(classification, args = {}) ⇒ Simple

Returns a new instance of Simple.



16
17
18
19
20
21
22
23
# File 'lib/svm_helper/selectors/simple.rb', line 16

# Sets up the selector for one classification.
#
# @param classification stored as-is in @classification
# @param args [Hash] optional settings; each missing key falls back to:
#   :global_dictionary => [], :classification_encoding => :bitmap,
#   :word_selection => :single, :gram_size => 1, :parallel => false
def initialize classification, args={}
  @classification          = classification
  @global_dictionary       = args.fetch(:global_dictionary, [])
  @classification_encoding = args.fetch(:classification_encoding, :bitmap)
  @word_selection          = args.fetch(:word_selection, :single)
  @gram_size               = args.fetch(:gram_size, 1)
  @parallel                = args.fetch(:parallel, false)
end

Instance Attribute Details

#classification_encodingObject (readonly)

Returns the value of attribute classification_encoding.



13
14
15
# File 'lib/svm_helper/selectors/simple.rb', line 13

# Reader for @classification_encoding.
#
# @return the stored classification encoding
def classification_encoding
  @classification_encoding
end

#global_dictionaryObject

Returns the value of attribute global_dictionary.



12
13
14
# File 'lib/svm_helper/selectors/simple.rb', line 12

# Reader for @global_dictionary.
#
# @return the dictionary currently held by this selector
def global_dictionary
  @global_dictionary
end

#gram_sizeObject (readonly)

Returns the value of attribute gram_size.



13
14
15
# File 'lib/svm_helper/selectors/simple.rb', line 13

# Reader for @gram_size.
#
# @return the stored gram size
def gram_size
  @gram_size
end

#word_selectionObject (readonly)

Returns the value of attribute word_selection.



13
14
15
# File 'lib/svm_helper/selectors/simple.rb', line 13

# Reader for @word_selection.
#
# @return the stored word selection setting
def word_selection
  @word_selection
end

Instance Method Details

#build_dictionary(data_set, dictionary_size = DEFAULT_DICTIONARY_SIZE) ⇒ Object



73
74
75
76
# File 'lib/svm_helper/selectors/simple.rb', line 73

# Extracts the words of every entry in the data set and hands them to
# generate_global_dictionary together with the requested size.
#
# @param data_set collection of data entries, passed to extract_words
# @param dictionary_size size forwarded to generate_global_dictionary
# @return whatever generate_global_dictionary returns
def build_dictionary data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
  generate_global_dictionary(extract_words(data_set), dictionary_size)
end

#extract_words(data_set) ⇒ Array<Array<String>>

extracts the words of all provided data entries

Parameters:

Returns:

  • (Array<Array<String>>)

    list of words per data entry



82
83
84
85
86
# File 'lib/svm_helper/selectors/simple.rb', line 82

# Runs extract_words_from_data on each entry of the data set.
#
# @param data_set collection of data entries (must respond to map)
# @return [Array] one extraction result per entry, in input order
def extract_words data_set
  data_set.map { |entry| extract_words_from_data(entry) }
end

#extract_words_from_data(data, keep_label = false) ⇒ OpenStruct<Array<String>,Boolean>

Fetches all words from one data entry (joined into n-grams of gram_size words when gram_size is greater than 1), after removing stopwords and words of two characters or fewer.

Parameters:

  • data (PreprocessedData)

    preprocessed data entry

  • keep_label (defaults to: false)

Returns:

  • (OpenStruct<Array<String>,Boolean>)

    list of words



93
94
95
96
97
98
99
100
# File 'lib/svm_helper/selectors/simple.rb', line 93

# Collects the words of one data entry: splits every string in data.data on
# whitespace, removes the entries returned by `stopwords`, and drops words
# of two characters or fewer. When gram_size is greater than 1, consecutive
# words are joined with a space into n-grams of that size.
#
# @param data entry responding to `data` with a list of strings
#   (presumably PreprocessedData, to be confirmed against callers)
# @return [Array<String>] the remaining words, or the n-grams built from them
def extract_words_from_data data
  words = (data.data.flat_map(&:split) - stopwords).reject { |word| word.size <= 2 }
  return words unless gram_size > 1

  # Use the accessor for the window size as well, so the check above and the
  # n-gram width can never disagree.
  words.each_cons(gram_size).map { |gram| gram.join " " }
end

#generate_global_dictionary(all_words, size = DEFAULT_DICTIONARY_SIZE) ⇒ Array<String>

generates a list of words used as dictionary

Parameters:

  • size (defaults to: DEFAULT_DICTIONARY_SIZE)

    dictionary size

Returns:

  • (Array<String>)

    list of words



64
65
66
67
68
69
70
71
# File 'lib/svm_helper/selectors/simple.rb', line 64

# Builds the global dictionary from the most frequent words, unless a
# non-empty dictionary is already present.
#
# @param all_words [Array] words per data entry; flattened before counting
# @param size [Integer] maximum number of words to keep
# @return [Array<String>] the dictionary in use, most frequent word first
def generate_global_dictionary all_words, size=DEFAULT_DICTIONARY_SIZE
  # An existing dictionary is left untouched; return it rather than nil so
  # the result is always the dictionary in use.
  return global_dictionary unless global_dictionary.empty?

  # Each group holds every occurrence of one word, so its size is the count.
  by_frequency = all_words.flatten.group_by { |word| word }.values.sort_by(&:size)
  @global_dictionary = by_frequency.last(size).map(&:first).reverse
end

#generate_vector(data, dictionary = global_dictionary) ⇒ FeatureVector

generates a feature vector with its label

Parameters:

  • data (PreprocessedData)
  • classification (Symbol)

    in :industry, :function, :career_level

  • dictionary (Array) (defaults to: global_dictionary)

    dictionary to use for this selection

Returns:



53
54
55
56
# File 'lib/svm_helper/selectors/simple.rb', line 53

# Builds the feature vector for a single data entry: the entry's extracted
# words are collected into a Set and passed to make_vector along with the
# entry itself and the dictionary.
#
# @param data data entry, passed to extract_words_from_data and make_vector
# @param dictionary dictionary forwarded to make_vector; defaults to the
#   selector's global_dictionary
# @return whatever make_vector returns
def generate_vector data, dictionary=global_dictionary
  make_vector(extract_words_from_data(data).to_set, data, dictionary)
end

#generate_vectors(data_set, dictionary_size = DEFAULT_DICTIONARY_SIZE) ⇒ Array<FeatureVector>

generates a list of feature vectors and their labels from preprocessed data

Parameters:

  • data_set (Array<PreprocessedData>)

    list of preprocessed data

  • classification (Symbol)

    in :industry, :function, :career_level

  • dictionary_size (Integer) (defaults to: DEFAULT_DICTIONARY_SIZE)

    Size of the dictionary to create if none exists

Returns:



36
37
38
39
40
41
42
43
44
# File 'lib/svm_helper/selectors/simple.rb', line 36

# Builds one feature vector per entry of the data set. The words of all
# entries are extracted first and offered to generate_global_dictionary;
# each entry's de-duplicated words are then passed to make_vector together
# with the entry at the same index.
#
# @param data_set indexable collection of data entries
# @param dictionary_size size forwarded to generate_global_dictionary
# @return the result of p_map_with_index over the per-entry word lists
def generate_vectors data_set, dictionary_size=DEFAULT_DICTIONARY_SIZE
  extracted = extract_words(data_set)
  generate_global_dictionary(extracted, dictionary_size)

  p_map_with_index(extracted) { |entry_words, position| make_vector(entry_words.uniq, data_set[position]) }
end

#labelObject



25
26
27
# File 'lib/svm_helper/selectors/simple.rb', line 25

# Name of this selector.
#
# @return [String] always "simple"
def label
  "simple"
end

#reset(classification) ⇒ Object



136
137
138
139
# File 'lib/svm_helper/selectors/simple.rb', line 136

# Discards the current dictionary and switches to another classification.
#
# @param classification value stored in @classification
# @return the given classification (value of the final assignment)
def reset classification
  # A new empty array is assigned; the previous dictionary object is not mutated.
  @global_dictionary = []
  @classification = classification
end