Class: Informers::AutoTokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/informers/tokenizers.rb

Constant Summary collapse

# Lookup table from a tokenizer class name (String) to the tokenizer class
# that implements it. `from_pretrained` consults this table with the
# "tokenizer_class" value from the tokenizer config (after removing a
# trailing "Fast") and uses PreTrainedTokenizer for names that are not listed.
TOKENIZER_CLASS_MAPPING =
{
  "T5Tokenizer" => T5Tokenizer,
  "BertTokenizer" => BertTokenizer,
  "DebertaV2Tokenizer" => DebertaV2Tokenizer,
  "DistilBertTokenizer" => DistilBertTokenizer,
  "BartTokenizer" => BartTokenizer,
  "RobertaTokenizer" => RobertaTokenizer,
  "XLMRobertaTokenizer" => XLMRobertaTokenizer,
  "MPNetTokenizer" => MPNetTokenizer,
  "CLIPTokenizer" => CLIPTokenizer,
  "GPT2Tokenizer" => GPT2Tokenizer,
  "NllbTokenizer" => NllbTokenizer,
  "M2M100Tokenizer" => M2M100Tokenizer,
  "SpeechT5Tokenizer" => SpeechT5Tokenizer
}

Class Method Summary collapse

Class Method Details

.from_pretrained(pretrained_model_name_or_path, quantized: true, progress_callback: nil, config: nil, cache_dir: nil, local_files_only: false, revision: "main", legacy: nil, **kwargs) ⇒ Object



267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
# File 'lib/informers/tokenizers.rb', line 267

# Loads a model's tokenizer data and builds the tokenizer object for it.
#
# The concrete class is picked from TOKENIZER_CLASS_MAPPING using the
# "tokenizer_class" entry of the loaded tokenizer config. When the entry is
# missing, or names a class that is not in the mapping, a warning is emitted
# and PreTrainedTokenizer is used instead.
#
# All keyword options listed below are forwarded unchanged to `load_tokenizer`.
#
# @param pretrained_model_name_or_path model identifier passed to `load_tokenizer`
# @param quantized forwarded to `load_tokenizer` (default: true)
# @param progress_callback forwarded to `load_tokenizer` (default: nil)
# @param config forwarded to `load_tokenizer` (default: nil)
# @param cache_dir forwarded to `load_tokenizer` (default: nil)
# @param local_files_only forwarded to `load_tokenizer` (default: false)
# @param revision forwarded to `load_tokenizer` (default: "main")
# @param legacy forwarded to `load_tokenizer` (default: nil)
# @param kwargs additional keywords are accepted but not used by this method
# @return [Object] a new instance of the resolved tokenizer class
def self.from_pretrained(
  pretrained_model_name_or_path,
  quantized: true,
  progress_callback: nil,
  config: nil,
  cache_dir: nil,
  local_files_only: false,
  revision: "main",
  legacy: nil,
  **kwargs
)
  loader_options = {
    quantized: quantized,
    progress_callback: progress_callback,
    config: config,
    cache_dir: cache_dir,
    local_files_only: local_files_only,
    revision: revision,
    legacy: legacy
  }
  tokenizer_json, tokenizer_config = load_tokenizer(pretrained_model_name_or_path, **loader_options)

  # A config may name the class with a trailing "Fast"; the mapping is keyed
  # by the name without it. A missing entry defaults to the base class name.
  tokenizer_name = tokenizer_config["tokenizer_class"]&.delete_suffix("Fast")
  tokenizer_name ||= "PreTrainedTokenizer"

  klass = TOKENIZER_CLASS_MAPPING[tokenizer_name]
  return klass.new(tokenizer_json, tokenizer_config) if klass

  # No registered class for this name: warn and fall back to the base class.
  warn "Unknown tokenizer class #{tokenizer_name.inspect}, attempting to construct from base class."
  PreTrainedTokenizer.new(tokenizer_json, tokenizer_config)
end

.load_tokenizer(pretrained_model_name_or_path, **options) ⇒ Object



300
301
302
303
304
305
306
307
308
309
310
311
# File 'lib/informers/tokenizers.rb', line 300

# Fetches the two tokenizer resources for a model through Utils::Hub.
#
# "tokenizer.json" is requested with `get_model_file` and
# "tokenizer_config.json" with `get_model_json`; both calls receive `true`
# as their third argument and all of `options`.
#
# @param pretrained_model_name_or_path model identifier passed to Utils::Hub
# @param options keyword options forwarded to both Utils::Hub calls; a
#   non-nil `:legacy` value is also written into the config under "legacy"
# @return [Array] two elements: the `get_model_file` result followed by the
#   `get_model_json` result
def self.load_tokenizer(pretrained_model_name_or_path, **options)
  tokenizer_file = Utils::Hub.get_model_file(pretrained_model_name_or_path, "tokenizer.json", true, **options)
  tokenizer_config = Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options)

  # An explicitly supplied legacy option (including false) takes precedence
  # over whatever the config holds.
  legacy = options[:legacy]
  tokenizer_config["legacy"] = legacy unless legacy.nil?

  [tokenizer_file, tokenizer_config]
end