Class: HuggingfaceModel
- Inherits:
-
TorchModel
- Object
- VectorModel
- PythonModel
- TorchModel
- HuggingfaceModel
- Defined in:
- lib/rbbt/vector/model/huggingface.rb
Direct Known Subclasses
Instance Attribute Summary
-
#tokenizer ⇒ Object
Returns the value of attribute tokenizer.
Attributes inherited from TorchModel
Attributes inherited from PythonModel
Attributes inherited from VectorModel
#balance, #bar, #directory, #eval_model, #extract_features, #factor_levels, #features, #init_model, #labels, #model, #model_options, #model_path, #names, #post_process, #train_model
Instance Method Summary
- #init ⇒ Object
-
#initialize(task, checkpoint, dir = nil, model_options = {}) ⇒ HuggingfaceModel
constructor
A new instance of HuggingfaceModel.
- #reset_model ⇒ Object
Methods inherited from TorchModel
device, dtype, feature_dataset, feature_tsv, freeze, #freeze_layer, freeze_layer, get_layer, #get_layer, #get_weights, get_weights, init_python, load_architecture, load_state, model_architecture, optimizer, save_architecture, save_state, tensor, text_dataset
Methods inherited from VectorModel
R_eval, R_run, R_train, #__load_method, #add, #add_list, #balance_labels, #clear, #cross_validation, #eval, #eval_list, f1_metrics, #run, #save_models, #train
Constructor Details
#initialize(task, checkpoint, dir = nil, model_options = {}) ⇒ HuggingfaceModel
Returns a new instance of HuggingfaceModel.
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
# File 'lib/rbbt/vector/model/huggingface.rb', line 16

# Sets up a model backed by the "rbbt_dm.huggingface" Python module and
# registers the four blocks (init_model, eval_model, train_model and
# post_process) that drive it.
#
# NOTE(review): every block re-fetches the model through `self.init` and reads
# only instance state rather than locals of this constructor; presumably so the
# blocks remain usable after `save_models` persists them -- confirm before
# extracting shared helpers or closing over constructor locals.
#
# @param task [String, nil] stored under @model_options[:task] when given; the
#   blocks below special-case "SequenceClassification" and "MaskedLM"
# @param checkpoint [String, Path, nil] stored under @model_options[:checkpoint]
#   when given; a Path is resolved with #find first
# @param dir [String, nil] forwarded to the superclass constructor
# @param model_options [Hash] forwarded to the superclass constructor; read
#   back here through @model_options
def initialize(task, checkpoint, dir = nil, model_options = {})
  super(dir, nil, model_options)

  checkpoint = checkpoint.find if Path === checkpoint

  # A caller-supplied :tokenizer_args entry is renamed to :tokenizer_options
  # before the :tokenizer keys are pulled out, and the pulled hash is then
  # stored back under :tokenizer_args.
  # NOTE(review): presumably pull_keys collects the "tokenizer_"-prefixed
  # entries -- confirm against IndiferentHash.pull_keys.
  @model_options[:tokenizer_options] = @model_options.delete(:tokenizer_args) if @model_options.include?(:tokenizer_args)
  tokenizer_args = IndiferentHash.pull_keys @model_options, :tokenizer
  @model_options[:tokenizer_args] = tokenizer_args

  @model_options[:task] = task if task
  @model_options[:checkpoint] = checkpoint if checkpoint

  # Loads the model and its tokenizer; returns [model, tokenizer].
  init_model do
    # A model directory already present on disk takes precedence over the
    # configured checkpoint.
    checkpoint = (@model_path && File.directory?(@model_path)) ? @model_path : @model_options[:checkpoint]

    model = RbbtPython.call_method("rbbt_dm.huggingface", :load_model,
                                   @model_options[:task], checkpoint,
                                   **(IndiferentHash.setup(@model_options.except(:training_args, :tokenizer_args, :task, :checkpoint, :class_labels))))

    # The tokenizer may come from its own checkpoint; otherwise it shares the
    # model's.
    tokenizer_checkpoint = @model_options[:tokenizer_args][:checkpoint] || checkpoint

    tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_tokenizer,
                                       tokenizer_checkpoint,
                                       **(IndiferentHash.setup(@model_options[:tokenizer_args])))

    [model, tokenizer]
  end

  # Runs prediction. Lists of texts and every "MaskedLM" input go through a
  # dataset file and :predict_model; any other single text goes straight to
  # :eval_model.
  eval_model do |texts,is_list|
    model, tokenizer = self.init

    if is_list || @model_options[:task] == "MaskedLM"
      texts = [texts] unless is_list

      if @model_options.include?(:locate_tokens)
        locate_tokens = @model_options[:locate_tokens]
      elsif @model_options[:task] == "MaskedLM"
        # Default to the tokenizer's mask token and remember the choice so
        # post_process sees the same value.
        @model_options[:locate_tokens] = locate_tokens = tokenizer.special_tokens_map["mask_token"]
      end

      if @directory
        tsv_file = File.join(@directory, 'dataset.tsv')
        checkpoint_dir = File.join(@directory, 'checkpoints')
      else
        tmpdir = TmpFile.tmp_file
        Open.mkdir tmpdir
        tsv_file = File.join(tmpdir, 'dataset.tsv')
        checkpoint_dir = File.join(tmpdir, 'checkpoints')
      end

      dataset_file = TorchModel.text_dataset(tsv_file, texts)
      training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, {})

      begin
        RbbtPython.call_method("rbbt_dm.huggingface", :predict_model, model, tokenizer, training_args_obj, dataset_file, locate_tokens)
      ensure
        Open.rm_rf tmpdir if tmpdir
      end
    else
      # locate_tokens is only assigned in the branch above, so it is always
      # nil when this call is reached.
      RbbtPython.call_method("rbbt_dm.huggingface", :eval_model, model, tokenizer, [texts], locate_tokens)
    end
  end

  # Trains on texts/labels and, when a model path is set, saves the model and
  # tokenizer there.
  train_model do |texts,labels|
    model, tokenizer = self.init

    if @directory
      tsv_file = File.join(@directory, 'dataset.tsv')
      checkpoint_dir = File.join(@directory, 'checkpoints')
    else
      tmpdir = TmpFile.tmp_file
      Open.mkdir tmpdir
      tsv_file = File.join(tmpdir, 'dataset.tsv')
      checkpoint_dir = File.join(tmpdir, 'checkpoints')
    end

    begin
      # NOTE(review): `training_args` is not defined in this method; presumably
      # an inherited accessor -- confirm.
      training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, training_args)
      dataset_file = HuggingfaceModel.text_dataset(tsv_file, texts, labels, @model_options[:class_labels])

      RbbtPython.call_method("rbbt_dm.huggingface", :train_model, model, tokenizer, training_args_obj, dataset_file, @model_options[:class_weights])
    ensure
      # Remove the scratch directory even when training raises, as eval_model
      # already does.
      Open.rm_rf tmpdir if tmpdir
    end

    model.save_pretrained(@model_path) if @model_path
    tokenizer.save_pretrained(@model_path) if @model_path
  end

  # Converts the raw Python result into Ruby values: raw logits when
  # :return_logits is set, otherwise a per-task decoding.
  post_process do |result,is_list|
    _model, tokenizer = self.init

    # Three result shapes are handled: an object exposing #predictions, a
    # mapping carrying "token_positions" next to "result", and a mapping with
    # "logits" (flagged as single).
    if result.respond_to?(:predictions)
      single = false
      predictions = result.predictions
    elsif result["token_positions"]
      predictions = result["result"].predictions
      token_positions = result["token_positions"]
    else
      single = true
      predictions = result["logits"]
    end

    if @model_options[:return_logits]
      result = RbbtPython.numpy2ruby(predictions)
    else
      task, class_labels, locate_tokens = @model_options.values_at :task, :class_labels, :locate_tokens
      result = case task
               when "SequenceClassification"
                 # Index of the highest logit, mapped to a label when
                 # class_labels is given.
                 RbbtPython.collect(predictions) do |logits|
                   logits = RbbtPython.numpy2ruby logits
                   best_class = logits.index logits.max
                   best_class = class_labels[best_class] if class_labels
                   best_class
                 end
               when "MaskedLM"
                 all_token_positions = token_positions.to_a

                 # Walk predictions and their token positions in step.
                 item_index = 0
                 RbbtPython.collect(predictions) do |item_logits|
                   item_token_positions = all_token_positions[item_index]
                   item_index += 1

                   item_logits = RbbtPython.numpy2ruby(item_logits)
                   item_masks = item_token_positions.collect do |positions|
                     best = item_logits.values_at(*positions).collect do |logits|
                       # First index holding the highest score (strict >
                       # keeps the earliest on ties).
                       best_token, best_score = nil
                       logits.each_with_index do |score, token|
                         if best_score.nil? || score > best_score
                           best_token, best_score = token, score
                         end
                       end
                       best_token
                     end
                     best.collect { |token| tokenizer.decode(token) } * "|"
                   end
                   Array === locate_tokens ? item_masks : item_masks.first
                 end
               else
                 predictions
               end
    end

    # Unwrap one-element results for non-list calls and for single results.
    (! is_list || single) && Array === result ? result.first : result
  end

  save_models if @model_path
end
Instance Attribute Details
#tokenizer ⇒ Object
Returns the value of attribute tokenizer.
5 6 7 |
# File 'lib/rbbt/vector/model/huggingface.rb', line 5

# Returns the value of attribute tokenizer.
#
# @return [Object, nil] the current value of @tokenizer
def tokenizer
  @tokenizer
end
Instance Method Details
#init ⇒ Object
6 7 8 9 |
# File 'lib/rbbt/vector/model/huggingface.rb', line 6

# Returns the model/tokenizer pair, building it on first use.
#
# The @init_model block is evaluated in the context of this instance only
# while @model is nil; its result is destructured into @model and @tokenizer.
#
# @return [Array] two-element array [@model, @tokenizer]
def init
  @model, @tokenizer = instance_exec(&@init_model) if @model.nil?
  [@model, @tokenizer]
end
#reset_model ⇒ Object
164 165 166 167 168 169 |
# File 'lib/rbbt/vector/model/huggingface.rb', line 164

# Discards the loaded model and tokenizer, deletes what was saved under
# @model_path (the path itself and the TorchModel architecture file derived
# from it), and then runs #init again.
#
# When @model_path is not set there is nothing on disk to delete, so the
# cleanup is skipped instead of handing nil to the file helpers.
#
# @return [Object] whatever #init returns
def reset_model
  @model = @tokenizer = nil

  if @model_path
    Open.rm_rf @model_path
    Open.rm_rf TorchModel.model_architecture(@model_path)
  end

  init
end