Class: Glove::Model

Inherits:
Object
  • Object
show all
Defined in:
lib/glove/model.rb

Constant Summary collapse

DEFAULTS =

Default options (see #initialize)

{
  max_count:      100,  # cutoff parameter of the weighting function
  learning_rate:  0.05, # initial learning rate
  alpha:          0.75, # exponent of the weighting function
  num_components: 30,   # column size of the word vector matrix
  epochs:         5,    # number of training iterations
  threads:        4     # threads for co-occurrence matrix building and training; must be > 0
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Glove::Model

Create a new Glove::Model instance. Accepts options for Corpus and Parser which only get forwarded and not used in this class.

Parameters:

  • options (Hash) (defaults to: {})

    the options to initialize the instance with.

Options Hash (options):

  • :max_count (Integer) — default: 100

    Parameter specifying cutoff in weighting function

  • :learning_rate (Float) — default: 0.05

    Initial learning rate

  • :alpha (Float) — default: 0.75

    Exponent of weighting function

  • :num_components (Integer) — default: 30

    Column size of the word vector matrix

  • :epochs (Integer) — default: 5

    Number of training iterations

  • :threads (Integer) — default: 4

    Number of threads to use in building the co-occurrence matrix and training iterations. Must be greater than 0


43
44
45
46
47
48
49
50
51
52
# File 'lib/glove/model.rb', line 43

# Build a model configured by DEFAULTS overridden with the given options.
#
# The merged hash is kept in @opts, and every entry is also exposed as an
# instance variable named after its key. The matrix and vector slots are
# reset to nil afterwards, so an option with one of those names cannot
# pre-populate them.
#
# @param options [Hash] overrides for DEFAULTS; extra keys are stored too
def initialize(options={})
  @opts = DEFAULTS.merge(options)
  @opts.each_pair { |name, setting| instance_variable_set("@#{name}", setting) }

  @cooc_matrix = @word_vec = @word_biases = nil
end

Instance Attribute Details

#alphaObject (readonly)

Returns the value of attribute alpha.


24
25
26
# File 'lib/glove/model.rb', line 24

# Reader for the :alpha option (exponent of the weighting function).
# #initialize assigns it from the merged options; DEFAULTS supplies 0.75.
#
# @return [Float] the stored alpha value
def alpha
  @alpha
end

#cooc_matrixObject (readonly)

Returns the value of attribute cooc_matrix.


25
26
27
# File 'lib/glove/model.rb', line 25

# Reader for the co-occurrence matrix.
# #initialize sets it to nil; #load assigns a GSL::Matrix.
#
# @return [GSL::Matrix, nil] the stored matrix, or nil before one is built or loaded
def cooc_matrix
  @cooc_matrix
end

#corpusGlove::Corpus (readonly)

Returns reference to the Corpus instance.

Returns:


23
# File 'lib/glove/model.rb', line 23

# Declares read-only accessors for opts, window, epochs, num_components and min_count.
# NOTE(review): :corpus is not in this list — presumably it is declared elsewhere in model.rb; confirm.
attr_reader :opts, :window, :epochs, :num_components, :min_count

#epochsObject (readonly)

Returns the value of attribute epochs.


23
24
25
# File 'lib/glove/model.rb', line 23

# Reader for the :epochs option (number of training iterations).
# #initialize assigns it from the merged options; DEFAULTS supplies 5.
#
# @return [Integer] the stored epoch count
def epochs
  @epochs
end

#learning_rateObject (readonly)

Returns the value of attribute learning_rate.


24
25
26
# File 'lib/glove/model.rb', line 24

# Reader for the :learning_rate option (initial learning rate).
# #initialize assigns it from the merged options; DEFAULTS supplies 0.05.
#
# @return [Float] the stored learning rate
def learning_rate
  @learning_rate
end

#max_countObject (readonly)

Returns the value of attribute max_count.


24
25
26
# File 'lib/glove/model.rb', line 24

# Reader for the :max_count option (cutoff parameter of the weighting function).
# #initialize assigns it from the merged options; DEFAULTS supplies 100.
#
# @return [Integer] the stored cutoff
def max_count
  @max_count
end

#min_countObject (readonly)

Returns the value of attribute min_count.


23
24
25
# File 'lib/glove/model.rb', line 23

# Reader for the :min_count option.
# DEFAULTS has no :min_count entry, so this is nil unless the option was
# passed to #initialize.
# NOTE(review): presumably one of the Corpus/Parser options that the model only forwards — confirm.
#
# @return [Object, nil] the value given in the constructor options, if any
def min_count
  @min_count
end

#num_componentsObject (readonly)

Returns the value of attribute num_components.


23
24
25
# File 'lib/glove/model.rb', line 23

# Reader for the :num_components option (column size of the word vector matrix).
# #initialize assigns it from the merged options; DEFAULTS supplies 30.
# #load uses it as the column count when allocating the word vector matrix.
#
# @return [Integer] the stored column count
def num_components
  @num_components
end

#optsObject (readonly)

Returns the value of attribute opts.


23
24
25
# File 'lib/glove/model.rb', line 23

# Reader for the full option set the model was built with.
#
# @return [Hash] DEFAULTS merged with the options passed to #initialize
def opts
  @opts
end

#threadsObject (readonly)

Returns the value of attribute threads.


24
25
26
# File 'lib/glove/model.rb', line 24

# Reader for the :threads option (threads used while building the
# co-occurrence matrix and during training iterations).
# #initialize assigns it from the merged options; DEFAULTS supplies 4.
#
# @return [Integer] the stored thread count
def threads
  @threads
end

#token_indexHash (readonly)

Returns reference to corpus.index.

Returns:

  • (Hash)

    reference to corpus.index


23
# File 'lib/glove/model.rb', line 23

# Declares read-only accessors for opts, window, epochs, num_components and min_count.
# NOTE(review): :token_index is not in this list — presumably it is declared elsewhere in model.rb; confirm.
attr_reader :opts, :window, :epochs, :num_components, :min_count

#token_pairsArray<(Glove::TokenPair)> (readonly)

Returns reference to corpus.pairs.

Returns:


23
# File 'lib/glove/model.rb', line 23

# Declares read-only accessors for opts, window, epochs, num_components and min_count.
# NOTE(review): :token_pairs is not in this list — presumably it is declared elsewhere in model.rb; confirm.
attr_reader :opts, :window, :epochs, :num_components, :min_count

#windowObject (readonly)

Returns the value of attribute window.


23
24
25
# File 'lib/glove/model.rb', line 23

# Reader for the :window option.
# DEFAULTS has no :window entry, so this is nil unless the option was
# passed to #initialize.
# NOTE(review): presumably one of the Corpus/Parser options that the model only forwards — confirm.
#
# @return [Object, nil] the value given in the constructor options, if any
def window
  @window
end

#word_biasesGSL::Vector

Returns the vector holding the word biases.

Returns:

  • (GSL::Vector)

    the vector holding the word biases


23
# File 'lib/glove/model.rb', line 23

# Declares read-only accessors for opts, window, epochs, num_components and min_count.
# NOTE(review): :word_biases is not in this list — presumably it is declared elsewhere in model.rb; confirm.
attr_reader :opts, :window, :epochs, :num_components, :min_count

#word_vecGSL::Matrix

Returns the word vector matrix.

Returns:

  • (GSL::Matrix)

    the word vector matrix


23
# File 'lib/glove/model.rb', line 23

# Declares read-only accessors for opts, window, epochs, num_components and min_count.
# NOTE(review): :word_vec is not in this list — presumably it is declared elsewhere in model.rb; confirm.
attr_reader :opts, :window, :epochs, :num_components, :min_count

Instance Method Details

#analogy_words(word1, word2, target, num = 3, accuracy = 0.0001) ⇒ Array

Get words that relate to :target like :word1 relates to :word2

Examples:

What words relate to atom like quantum relates to physics?

model.analogy_words('quantum', 'physics', 'atom')
# => [["electron", 0.98583], ["energi", 0.98151], ["photon",0.96650]]

Parameters:

  • word1 (String)
  • word2 (String)
  • target (String)
  • num (Integer) (defaults to: 3)

    Number of related words to :target

  • accuracy (Float) (defaults to: 0.0001)

    Allowance in difference of target cosine and related word cosine distances

Returns:

  • (Array)

    List of related words to target


134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/glove/model.rb', line 134

# Find words that relate to +target+ the way +word1+ relates to +word2+.
#
# All three words are stemmed, then the cosine between the vectors of
# word1 and word2 is used as the reference value. Candidates come from
# vector_distance(target); an entry is dropped when the absolute value of
# its second element lies within +accuracy+ of the reference.
#
# NOTE(review): the filter uses +reject+, i.e. it removes the candidates
# that match the reference within +accuracy+. The parameter description
# could also be read as keeping them — confirm the intent.
#
# @param word1 [String] first word of the reference pair
# @param word2 [String] second word of the reference pair
# @param target [String] word to find related words for
# @param num [Integer] maximum number of entries to return
# @param accuracy [Float] tolerance used when comparing a candidate with
#   the reference cosine
# @return [Array] up to +num+ entries of vector_distance(target)
def analogy_words(word1, word2, target, num=3, accuracy=0.0001)
  word1  = word1.stem
  # word2 must be stemmed from itself; deriving it from word1 would make
  # the reference cosine compare word1 with word1.
  word2  = word2.stem
  target = target.stem

  distance = cosine(vector(word1), vector(word2))

  vector_distance(target).reject do |item|
    diff = item[1].to_f.abs - distance
    diff.abs < accuracy
  end.take(num)
end

#fit(text) ⇒ Glove::Model

Fit a string or Corpus instance and build the co-occurrence matrix

Examples:

Provide corpus for the model

model = Glove::Model.new
model.fit(File.read('shakespeare.txt'))

Provide a Corpus instance as text argument

model = Glove::Model.new
corpus = Glove::Corpus.build(File.read('shakespeare.txt'))
model.fit(corpus)

Parameters:

Returns:


65
66
67
68
69
70
# File 'lib/glove/model.rb', line 65

# Prepare the model from raw text (or a Corpus instance).
#
# Runs the three preparation steps in order: fit_corpus with the given
# argument, build_cooc_matrix, then build_word_vectors.
#
# @param text [String, Glove::Corpus] passed straight to fit_corpus
# @return [Glove::Model] self, so calls can be chained
def fit(text)
  tap do
    fit_corpus(text)
    build_cooc_matrix
    build_word_vectors
  end
end

#inspectObject

Prevent token_pairs, matrices and vectors from filling up the terminal


159
160
161
# File 'lib/glove/model.rb', line 159

# Returns the #to_s representation instead of the default inspect output,
# so token pairs, matrices and vectors do not flood the terminal.
#
# @return [String] whatever #to_s returns
def inspect
  to_s
end

#load(corpus_file, cooc_file, vec_file, bias_file) ⇒ Object

Loads training data from already existing files

Parameters:

  • corpus_file (String)

    Filename for corpus

  • cooc_file (String)

    Filename for co-occurrence matrix

  • vec_file (String)

    Filename for Word Vector Matrix

  • bias_file (String)

    Filename for Word Biases Vector


101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/glove/model.rb', line 101

# Restore a previously saved model from four files.
#
# The corpus is unmarshalled first; its index and pairs are kept as the
# token index and token pairs. The co-occurrence matrix (size x size), the
# word vector matrix (size x num_components) and the bias vector (size) are
# then allocated and filled from their files with +fread+, where +size+ is
# the number of entries in the token index.
#
# SECURITY: Marshal.load can instantiate arbitrary objects. Only pass
# corpus files that come from a trusted source.
#
# NOTE(review): num_components presumably has to match the value the
# vector file was written with — confirm against the GSL fread behaviour.
#
# @param corpus_file [String] filename of the marshalled corpus
# @param cooc_file [String] filename of the co-occurrence matrix
# @param vec_file [String] filename of the word vector matrix
# @param bias_file [String] filename of the word biases vector
# @return [Object] the result of the final +fread+ call
def load(corpus_file, cooc_file, vec_file, bias_file)
  @corpus = Marshal.load(File.binread(corpus_file))

  # Read back through the corpus reader, so @corpus must be assigned first.
  @token_index = corpus.index
  @token_pairs = corpus.pairs

  size = token_index.size

  # Containers are allocated with their final dimensions before reading.
  @cooc_matrix = GSL::Matrix.alloc(size, size)
  @word_vec    = GSL::Matrix.alloc(size, num_components)
  @word_biases = GSL::Vector.alloc(size)

  @cooc_matrix.fread(cooc_file)
  @word_vec.fread(vec_file)
  @word_biases.fread(bias_file)
end

#most_similar(word, num = 3) ⇒ Array

Get most similar words to :word

Examples:

Get 1 most similar word to 'physics'

model.most_similar('physics', 1) # => [["quantum", 0.9967993356234444]]

Parameters:

  • word (String)

    The word to find similar to

  • num (Integer) (defaults to: 3)

    (3) Number of similar words to :word

Returns:

  • (Array)

    List of most similar words with cosine distance as values


154
155
156
# File 'lib/glove/model.rb', line 154

# Look up the entries closest to +word+.
#
# The word is stemmed before it is handed to vector_distance; the first
# +num+ entries of that result are returned.
#
# @param word [String] the word to find similar words for
# @param num [Integer] maximum number of entries to return
# @return [Array] the first +num+ entries of vector_distance for the stem
def most_similar(word, num=3)
  stemmed = word.stem
  ranked  = vector_distance(stemmed)
  ranked.take(num)
end

#save(corpus_file, cooc_file, vec_file, bias_file) ⇒ Object

Save trained data to files

Parameters:

  • corpus_file (String)

    Filename for corpus

  • cooc_file (String)

    Filename for co-occurrence matrix

  • vec_file (String)

    Filename for Word Vector Matrix

  • bias_file (String)

    Filename for Word Biases Vector


85
86
87
88
89
90
91
92
93
# File 'lib/glove/model.rb', line 85

# Persist the trained model to four files.
#
# The corpus is marshalled into +corpus_file+ (binary mode); the
# co-occurrence matrix, word vector matrix and bias vector each write
# themselves through +fwrite+, in that order.
#
# @param corpus_file [String] filename for the marshalled corpus
# @param cooc_file [String] filename for the co-occurrence matrix
# @param vec_file [String] filename for the word vector matrix
# @param bias_file [String] filename for the word biases vector
# @return [Object] the result of the final +fwrite+ call
def save(corpus_file, cooc_file, vec_file, bias_file)
  File.open(corpus_file, 'wb') { |io| io.write(Marshal.dump(corpus)) }

  cooc_matrix.fwrite(cooc_file)
  word_vec.fwrite(vec_file)
  word_biases.fwrite(bias_file)
end

#trainGlove::Model

Train the model. #fit must be called first.

Returns:


74
75
76
77
# File 'lib/glove/model.rb', line 74

# Run the training epochs over the co-occurrence data.
#
# Passes the result of matrix_nnz to train_in_epochs. #fit has to be
# called beforehand so that there is data to train on.
#
# @return [Glove::Model] self, so calls can be chained
def train
  tap { train_in_epochs(matrix_nnz) }
end

#visualizeObject

TODO:

create graph of the word vector matrix


119
120
121
# File 'lib/glove/model.rb', line 119

# Placeholder for graphing the word vector matrix.
#
# TODO: create a graph of the word vector matrix.
#
# @raise [RuntimeError] always, because the feature does not exist yet
def visualize
  raise RuntimeError, "Not implemented"
end