Class: VectorModel

Inherits:
Object
  • Object
show all
Defined in:
lib/rbbt/vector/model.rb,
lib/rbbt/vector/model/util.rb

Direct Known Subclasses

RFModel, SVMModel, SpaCyModel, TensorFlowModel

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(directory, extract_features = nil, train_model = nil, eval_model = nil, names = nil, factor_levels = nil) ⇒ VectorModel

Returns a new instance of VectorModel.



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# File 'lib/rbbt/vector/model.rb', line 108

# Builds a model rooted at +directory+, creating the directory when missing.
#
# Every component passed as nil is restored from its companion file inside
# +directory+ when that file exists; otherwise the corresponding instance
# variable is left unset.
#
# @param directory [String] directory holding the model and its companion files
# @param extract_features [Proc, nil] feature extractor; when nil it is loaded
#   from the "features" file
# @param train_model [Proc, String, nil] training code; when nil it is loaded
#   as a Proc from "train_model" or read as text from "train_model.R"
# @param eval_model [Proc, String, nil] evaluation code; when nil it is loaded
#   as a Proc from "eval_model" or read as text from "eval_model.R"
# @param names [Array<String>, nil] feature names; when nil they are read from
#   "feature_names", one name per line
# @param factor_levels [Object, nil] name => levels mapping; when nil it is
#   read from "levels" (YAML) and then, if present, from "model.factor_levels"
# @return [VectorModel]
def initialize(directory, extract_features = nil, train_model = nil, eval_model = nil, names = nil, factor_levels = nil)
  @directory = directory
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  FileUtils.mkdir_p @directory unless File.exist? @directory

  @model_file = File.join(@directory, "model")
  @extract_features_file = File.join(@directory, "features")
  @train_model_file = File.join(@directory, "train_model")
  @eval_model_file = File.join(@directory, "eval_model")
  @train_model_file_R = File.join(@directory, "train_model.R")
  @eval_model_file_R = File.join(@directory, "eval_model.R")
  @names_file = File.join(@directory, "feature_names")
  @levels_file = File.join(@directory, "levels")

  if extract_features.nil?
    if File.exist?(@extract_features_file)
      @extract_features = __load_method @extract_features_file
    end
  else
    @extract_features = extract_features
  end

  if train_model.nil?
    if File.exist?(@train_model_file)
      @train_model = __load_method @train_model_file
    elsif File.exist?(@train_model_file_R)
      @train_model = Open.read(@train_model_file_R)
    end
  else
    @train_model = train_model
  end

  if eval_model.nil?
    if File.exist?(@eval_model_file)
      @eval_model = __load_method @eval_model_file
    elsif File.exist?(@eval_model_file_R)
      @eval_model = Open.read(@eval_model_file_R)
    end
  else
    @eval_model = eval_model
  end

  if names.nil?
    if File.exist?(@names_file)
      @names = Open.read(@names_file).split("\n")
    end
  else
    # Explicit names belong in @names; they must not replace the extractor.
    @names = names
  end

  if factor_levels.nil?
    if File.exist?(@levels_file)
      # NOTE(review): YAML.load deserializes arbitrary objects on older Psych
      # versions; consider YAML.safe_load if model directories can come from
      # untrusted sources.
      @factor_levels = YAML.load(Open.read(@levels_file))
    end
    # Levels saved next to the model take precedence over the YAML file.
    if File.exist?(@model_file + '.factor_levels')
      @factor_levels = TSV.open(@model_file + '.factor_levels')
    end
  else
    @factor_levels = factor_levels
  end

  @features = []
  @labels = []
end

Instance Attribute Details

#bar(max = nil, desc = nil) ⇒ Object

Returns the value of attribute bar.



2
3
4
# File 'lib/rbbt/vector/model/util.rb', line 2

# Reader for the @bar instance variable.
def bar
  @bar
end

#directory ⇒ Object

Returns the value of attribute directory.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for the @directory instance variable (set by the constructor from
# its +directory+ argument).
def directory
  @directory
end

#eval_model ⇒ Object

Returns the value of attribute eval_model.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for the @eval_model instance variable.
def eval_model
  @eval_model
end

#extract_features ⇒ Object

Returns the value of attribute extract_features.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for the @extract_features instance variable.
def extract_features
  @extract_features
end

#factor_levels ⇒ Object

Returns the value of attribute factor_levels.



6
7
8
# File 'lib/rbbt/vector/model.rb', line 6

# Reader for the @factor_levels instance variable.
def factor_levels
  @factor_levels
end

#features ⇒ Object

Returns the value of attribute features.



6
7
8
# File 'lib/rbbt/vector/model.rb', line 6

# Reader for the @features instance variable.
def features
  @features
end

#labels ⇒ Object

Returns the value of attribute labels.



6
7
8
# File 'lib/rbbt/vector/model.rb', line 6

# Reader for the @labels instance variable.
def labels
  @labels
end

#model_file ⇒ Object

Returns the value of attribute model_file.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for the @model_file instance variable.
def model_file
  @model_file
end

#names ⇒ Object

Returns the value of attribute names.



6
7
8
# File 'lib/rbbt/vector/model.rb', line 6

# Reader for the @names instance variable.
def names
  @names
end

#train_model ⇒ Object

Returns the value of attribute train_model.



5
6
7
# File 'lib/rbbt/vector/model.rb', line 5

# Reader for the @train_model instance variable.
def train_model
  @train_model
end

Class Method Details

.f1_metrics(test, predicted, good_label = nil) ⇒ Object

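Computes the true/false positive and negative counts together with precision, recall and F1 for the predicted labels against the test labels. When more than two distinct labels are present and no good_label is given, the metrics are computed once per label and combined.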



299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
# File 'lib/rbbt/vector/model.rb', line 299

# Computes confusion counts, precision, recall and F1 for a set of predictions.
#
# Labels are compared through their string form. When exactly two distinct
# labels appear across +test+ and +predicted+, or when +good_label+ is given,
# the metrics are computed for a single positive label. That label defaults,
# in order, to the label whose string form is "true", then the one whose
# string form is "1", then the first label in sort order.
#
# Otherwise the metrics are computed once per label (each taken in turn as
# the positive one) and combined with Misc.zip_fields and Misc.mean
# (presumably a field-by-field average; confirm against Misc).
#
# Precision, recall and F1 are plain float divisions, so each is NaN when
# its denominator is zero.
#
# @param test [Array] reference labels
# @param predicted [Array] predicted labels, paired with +test+ by position
# @param good_label [Object, nil] label to treat as positive
# @return [Array] [tp, tn, fp, fn, precision, recall, f1]
def self.f1_metrics(test, predicted, good_label = nil)
  labels = (test + predicted).uniq

  unless labels.length == 2 || good_label
    per_label = labels.map do |good_label|
      values = VectorModel.f1_metrics(test, predicted, good_label)
      tp, tn, fp, fn, pr, re, f1 = values
      Log.debug "Partial CV #{good_label} - P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
      values
    end
    return Misc.zip_fields(per_label).collect { |s| Misc.mean(s) }
  end

  # Pick the positive label when the caller did not provide one.
  good_label = labels.find { |l| l.to_s == "true" } if good_label.nil?
  good_label = labels.find { |l| l.to_s == "1" } if good_label.nil?
  good_label = labels.sort.first if good_label.nil?
  positive = good_label.to_s

  tp = tn = fp = fn = 0
  test.zip(predicted).each do |gold, guess|
    gold_positive = gold.to_s == positive
    guess_positive = guess.to_s == positive

    if guess_positive
      if gold_positive
        tp += 1
      else
        fp += 1
      end
    elsif gold_positive
      fn += 1
    else
      tn += 1
    end
  end

  actual_positives = tp + fn
  predicted_positives = tp + fp

  precision = tp.to_f / predicted_positives
  recall = tp.to_f / actual_positives
  f1 = (2.0 * tp) / (2.0 * tp + fp + fn)

  [tp, tn, fp, fn, precision, recall, f1]
end

.R_eval(model_file, features, list, code, names = nil, factor_levels = nil) ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/rbbt/vector/model.rb', line 69

# Evaluates a saved R model on one feature vector or on a list of them.
#
# The features are written to a temporary tab-separated file and read in R
# with read.table. The R object stored in +model_file+ is loaded with load(),
# then +code+ is run; +code+ must leave its predictions in an R variable
# named `label`, which is printed and parsed back here.
#
# The printed output is split on whitespace, so predictions that contain
# whitespace come back as several tokens.
#
# @param model_file [String] path of the R model to load
# @param features [Array] a list of feature rows when +list+ is truthy,
#   otherwise a single feature row
# @param list [Boolean] whether +features+ holds several rows
# @param code [String] R code computing `label` from `features`
# @param names [Array<String>, nil] column names applied through make.names
# @param factor_levels [Object, nil] name => levels pairs used to re-level
#   the matching columns as factors
# @return [Array<String>, String, nil] all predictions when +list+ is truthy,
#   otherwise the first one
def self.R_eval(model_file, features, list, code, names = nil, factor_levels = nil)
  TmpFile.with_file do |feature_file|
    if list
      Open.write(feature_file, features.collect{|feat| feat * "\t"} * "\n" + "\n")
    else
      Open.write(feature_file, features * "\t" + "\n")
    end
    Open.write(feature_file + '.names', names * "\n" + "\n") if names

    # NOTE(review): the inner temporary file is never used by the R code.
    TmpFile.with_file do |_results|

      io = R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
#{"names(features) = make.names(readLines('#{feature_file + '.names'}'))" if names }
#{ factor_levels.collect do |name,levels|
  "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
end * "\n" if factor_levels }
load(file="#{model_file}");
#{code}
cat(paste(label, sep="\\n", collapse="\\n"));
      EOF
      txt = io.read
      # Drop the first "WARNING: ..." line, if any. No regexp flag is used:
      # in Ruby the `s` flag selects the Windows-31J encoding (it is not
      # "dot-all") and makes the match raise on non-ASCII UTF-8 output.
      res = txt.sub(/WARNING: .*?\n/, '').split(/\s+/)

      if list
        res
      else
        res.first
      end
    end
  end
end

.R_run(model_file, features, labels, code, names = nil, factor_levels = nil) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/rbbt/vector/model.rb', line 8

# Runs arbitrary R +code+ over the given features and labels.
#
# Features are written one tab-separated row per line to a temporary file,
# labels to a ".label" companion and, when given, names to a ".names"
# companion. In R the table is loaded as `features`, optionally renamed and
# re-levelled, and the labels are appended as a `label` column before
# +code+ runs. +model_file+ is accepted but not referenced here.
#
# @param model_file [String] unused in this method
# @param features [Array<Array>] feature rows
# @param labels [Array] one label per row; read as numeric() in R when the
#   first label is Numeric, as character() otherwise
# @param code [String] R code to execute
# @param names [Array<String>, nil] column names applied through make.names
# @param factor_levels [Object, nil] name => levels pairs used to re-level
#   the matching columns as factors
# @return [Object] whatever R.run returns
def self.R_run(model_file, features, labels, code, names = nil, factor_levels = nil)
  TmpFile.with_file do |feature_file|
    rows = features.map { |feats| feats.join("\t") }
    Open.write(feature_file, rows.join("\n"))
    Open.write("#{feature_file}.label", labels.join("\n") + "\n")
    Open.write("#{feature_file}.names", names.join("\n") + "\n") if names

    # Integer and Float are both Numeric, so one check covers them all.
    what = Numeric === labels.first ? 'numeric()' : 'character()'

    R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
#{"names(features) = make.names(readLines('#{feature_file + '.names'}'))" if names }
#{ factor_levels.collect do |name,levels|
  "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
end * "\n" if factor_levels }
labels = scan("#{ feature_file }.label", what=#{what});
features = cbind(features, label = labels);
#{code}
    EOF
  end
end

.R_train(model_file, features, labels, code, names = nil, factor_levels = nil) ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/rbbt/vector/model.rb', line 35

# Trains a model in R and saves it to +model_file+.
#
# Features, labels and (optionally) names are written to temporary files and
# loaded in R as a `features` table with a trailing `label` column. +code+
# then runs and must define an R variable named `model`, which is stored
# with save(). The levels of every factor column are also written, through
# rbbt.tsv.write, to "<model_file>.factor_levels".
#
# @param model_file [String] destination path for the saved R model
# @param features [Array<Array>] feature rows
# @param labels [Array] one label per row; read as numeric() in R when the
#   first label is Numeric, as character() otherwise
# @param code [String] R code that builds `model` from `features`
# @param names [Array<String>, nil] column names applied through make.names
# @param factor_levels [Object, nil] name => levels pairs used to re-level
#   the matching columns as factors
# @return [Object] whatever R.run returns
def self.R_train(model_file, features, labels, code, names = nil, factor_levels = nil)
  TmpFile.with_file do |feature_file|
    rows = features.map { |feats| feats.join("\t") }
    Open.write(feature_file, rows.join("\n"))
    Open.write("#{feature_file}.label", labels.join("\n") + "\n")
    Open.write("#{feature_file}.names", names.join("\n") + "\n") if names

    # Integer and Float are both Numeric, so one check covers them all.
    what = Numeric === labels.first ? 'numeric()' : 'character()'

    R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=TRUE);
labels = scan("#{ feature_file }.label", what=#{what});
#{"names(features) = make.names(readLines('#{feature_file + '.names'}'))" if names }
features = cbind(features, label = labels);
#{ factor_levels.collect do |name,levels|
  "features[['#{name}']] = factor(features[['#{name}']], levels=#{R.ruby2R levels})"
end * "\n" if factor_levels }
#{code}
# Save used factor levels
factor_levels = c()
for (c in names(features)){
if (is.factor(features[[c]]))
  factor_levels[c] = paste(levels(features[[c]]), collapse="\t")
}
rbbt.tsv.write("#{model_file}.factor_levels", factor_levels, names=c('Levels'), type='flat')
save(model, file='#{model_file}')
    EOF
  end
end

Instance Method Details

#__load_method(file) ⇒ Object



102
103
104
105
106
# File 'lib/rbbt/vector/model.rb', line 102

# Rebuilds a Proc from Ruby source previously saved in +file+.
#
# Whatever precedes "Proc.new" on the first line containing it (for example
# an assignment such as `name = `) is stripped, and the remaining source is
# evaluated in the context of this instance, with +file+ as the reported
# file name.
#
# NOTE(review): this evaluates the file contents as Ruby code, so it should
# only be pointed at trusted model directories.
#
# @param file [String] path of the saved source
# @return [Object] the result of evaluating the source
def __load_method(file)
  source = Open.read(file).sub(/.*Proc\.new/, "Proc.new")
  instance_eval(source, file)
end

#add(element, label = nil) ⇒ Object



177
178
179
180
181
# File 'lib/rbbt/vector/model.rb', line 177

# Appends one example to the stored training data.
#
# The element is passed through the feature extractor when one is set;
# otherwise the element itself is stored as its feature vector.
#
# @param element [Object] raw element, or a ready feature vector
# @param label [Object, nil] label stored alongside the features
# @return [Array] the updated list of labels
def add(element, label = nil)
  vector = if @extract_features
             extract_features.call(element)
           else
             element
           end

  @features.push(vector)
  @labels.push(label)
end

#add_list(elements, labels = nil) ⇒ Object



183
184
185
186
187
188
189
190
191
192
193
# File 'lib/rbbt/vector/model.rb', line 183

# Appends several examples to the stored training data.
#
# With no extractor, or an extractor of arity 1, every element goes through
# #add paired with its label (nil when +labels+ is missing or shorter).
# Otherwise the extractor is called once as call(nil, elements) and its
# result is appended to the stored features; in that case the stored labels
# are extended only when +labels+ is given.
#
# @param elements [Array] raw elements
# @param labels [Array, nil] labels paired with +elements+ by position
# @return [Object] the zipped pairs in the per-element case; otherwise the
#   updated labels, or nil when +labels+ is nil
def add_list(elements, labels = nil)
  one_by_one = @extract_features.nil? || @extract_features.arity == 1

  if one_by_one
    elements.zip(labels || [nil]).each { |item, tag| add(item, tag) }
  else
    batch = @extract_features.call(nil, elements)
    @features.concat(batch)
    @labels.concat(labels) if labels
  end
end

#clear ⇒ Object



172
173
174
175
# File 'lib/rbbt/vector/model.rb', line 172

# Discards all stored features and labels, replacing both with fresh,
# independent empty arrays.
#
# @return [Array] the new, empty list of labels
def clear
  @features = Array.new
  @labels = Array.new
end

#cross_validation(folds = 10, good_label = nil) ⇒ Object



342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
# File 'lib/rbbt/vector/model.rb', line 342

# Runs a cross-validation over the features and labels currently stored.
#
# The stored features and labels are split into +folds+ parts with
# Misc.divide. For each part the model is trained on the remaining parts,
# evaluated on the held-out part, and scored with VectorModel.f1_metrics.
# With folds == 1 the single part is used both for training and for testing.
#
# The stored features and labels are always restored afterwards, and the
# model is retrained on them unless folds == 1.
#
# @param folds [Integer] number of parts to split the data into
# @param good_label [Object, nil] label treated as positive, forwarded to
#   VectorModel.f1_metrics
# @return [Object] the TSV.setup result keyed by fold index; each entry is
#   [P, R, F1] when there are more than two distinct labels, otherwise
#   [TP, TN, FP, FN, P, R, F1]
# @raise [RuntimeError] when the number of predictions differs from the
#   number of test labels in a fold
def cross_validation(folds = 10, good_label = nil)

  # Kept so the ensure clause can put the full data set back.
  orig_features = @features
  orig_labels = @labels

  multiclass = @labels.uniq.length > 2

  if multiclass
    res = TSV.setup({}, "Fold~P,R,F1#:type=:list")
  else
    res = TSV.setup({}, "Fold~TP,TN,FP,FN,P,R,F1#:type=:list")
  end

  begin
    if folds == 1
      feature_folds = [@features]
      labels_folds = [@labels]
    else
      # NOTE(review): assumes Misc.divide splits both arrays the same way so
      # that features and labels stay aligned; confirm against Misc.
      feature_folds = Misc.divide(@features, folds)
      labels_folds = Misc.divide(@labels, folds)
    end

    folds.times do |fix|

      # Indices of the parts used for training in this round.
      if folds == 1
        rest = [fix]
      else
        rest = (0..(folds-1)).to_a - [fix]
      end

      test_set = feature_folds[fix]
      train_set = feature_folds.values_at(*rest).inject([]){|acc,e| acc += e; acc}

      test_labels = labels_folds[fix]
      train_labels = labels_folds.values_at(*rest).flatten

      # train reads @features and @labels, so they are swapped temporarily.
      @features = train_set
      @labels = train_labels

      self.train
      # extract = false: the test set is passed to the model without going
      # through the feature extractor.
      predictions = self.eval_list test_set, false

      raise "Number of predictions (#{predictions.length}) and test labels (#{test_labels.length}) do not match" if predictions.length != test_labels.length

      # NOTE(review): different_labels is assigned but never read.
      different_labels = test_labels.uniq

      Log.debug do "Accuracy Fold #{fix}: #{(100 * test_labels.zip(predictions).select{|t,p| t == p }.length.to_f / test_labels.length).round(2)}%"  end

      tp, tn, fp, fn, pr, re, f1 = VectorModel.f1_metrics(test_labels, predictions, good_label)

      if multiclass 
        Log.low "Multi-class CV Fold #{fix} - Average P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1}"
        res[fix] = [pr,re,f1]
      else
        Log.low "CV Fold #{fix} P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
        res[fix] = [tp,tn,fp,fn,pr,re,f1]
      end

    end
  ensure
    # Restore the full data set even when a fold raised.
    @features = orig_features
    @labels = orig_labels
  end
  # With a single fold the model was already trained on all the data.
  self.train unless folds == 1
  res
end

#eval(element) ⇒ Object



238
239
240
241
242
243
244
245
# File 'lib/rbbt/vector/model.rb', line 238

# Evaluates the model on a single element.
#
# The element goes through the feature extractor when one is set; otherwise
# it is used directly as the feature vector, matching what #add and
# #eval_list do when no extractor is configured.
#
# @param element [Object] raw element, or a ready feature vector
# @return [Object, nil] the prediction, or nil when the evaluation model is
#   neither a Proc nor a String
def eval(element)
  return nil unless Proc === @eval_model || String === @eval_model

  # Fall back to the element itself instead of calling nil when there is no
  # extractor.
  features = @extract_features ? @extract_features.call(element) : element

  if Proc === @eval_model
    @eval_model.call(@model_file, features, false, nil, @names, @factor_levels)
  else
    VectorModel.R_eval(@model_file, features, false, eval_model, @names, @factor_levels)
  end
end

#eval_list(elements, extract = true) ⇒ Object



247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
# File 'lib/rbbt/vector/model.rb', line 247

# Evaluates the model on a list of elements.
#
# When +extract+ is truthy and an extractor is set, features are computed
# first: one call per element for an extractor of arity 1, otherwise a
# single call as call(nil, elements). In every other case +elements+ is
# taken to be the features already.
#
# @param elements [Array] raw elements, or ready feature vectors
# @param extract [Boolean] whether to run the feature extractor
# @return [Object, nil] the predictions, or nil when the evaluation model is
#   neither a Proc nor a String
def eval_list(elements, extract = true)
  features = elements

  if extract && !@extract_features.nil?
    features =
      if @extract_features.arity == 1
        elements.map { |element| @extract_features.call(element) }
      else
        @extract_features.call(nil, elements)
      end
  end

  model = eval_model
  case model
  when Proc
    model.call(@model_file, features, true, nil, @names, @factor_levels)
  when String
    VectorModel.R_eval(@model_file, features, true, model, @names, @factor_levels)
  end
end

#run(code) ⇒ Object



234
235
236
# File 'lib/rbbt/vector/model.rb', line 234

# Runs the R snippet +code+ over the features and labels currently stored in
# this model by delegating to VectorModel.R_run, passing along the model
# file, feature names and factor levels.
#
# @param code [String] R code to execute
# @return [Object] whatever VectorModel.R_run returns
def run(code)
  VectorModel.R_run(@model_file,  @features, @labels, code, @names, @factor_levels)
end

#save_models ⇒ Object



195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# File 'lib/rbbt/vector/model.rb', line 195

# Persists the model's code and metadata into the model directory.
#
# Proc-based train/eval models are saved as their Ruby source (obtained with
# #source from the method_source gem); failures while doing so are ignored.
# String-based models are written as-is to the ".R" files. The feature
# extractor source, the factor levels (as YAML) and the feature names (one
# per line) are written when present.
#
# @return [Object] the result of the last write, or nil when there are no
#   feature names
def save_models
  require 'method_source'

  trainer = train_model
  case trainer
  when Proc
    begin
      Open.write(@train_model_file, trainer.source)
    rescue StandardError
      # Best effort: the source of a Proc may not be retrievable.
    end
  when String
    Open.write(@train_model_file_R, @train_model)
  end

  if @extract_features
    Open.write(@extract_features_file, @extract_features.source)
  end

  evaluator = eval_model
  case evaluator
  when Proc
    begin
      Open.write(@eval_model_file, evaluator.source)
    rescue StandardError
      # Best effort: the source of a Proc may not be retrievable.
    end
  when String
    Open.write(@eval_model_file_R, evaluator)
  end

  Open.write(@levels_file, @factor_levels.to_yaml) if @factor_levels
  Open.write(@names_file, @names * "\n" + "\n") if @names
end

#train ⇒ Object



224
225
226
227
228
229
230
231
232
# File 'lib/rbbt/vector/model.rb', line 224

# Trains the model on the stored features and labels, then saves the model
# code and metadata through #save_models.
#
# A Proc train model is called with the model file, features, labels, names
# and factor levels. A String train model is handed to VectorModel.R_train
# as R code. Any other value skips training.
#
# @return [Object] whatever #save_models returns
def train
  trainer = train_model

  case trainer
  when Proc
    trainer.call(@model_file, @features, @labels, @names, @factor_levels)
  when String
    VectorModel.R_train(@model_file, @features, @labels, trainer, @names, @factor_levels)
  end

  save_models
end