Class: Transformers::AutoTokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/transformers/models/auto/tokenization_auto.rb

Class Method Summary collapse

Class Method Details

.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) ⇒ Object

Raises:

  • (ArgumentError)


28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/transformers/models/auto/tokenization_auto.rb', line 28

def from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
  # Resolve and instantiate the concrete tokenizer class for a model id or
  # local path. Resolution order:
  #   1. "tokenizer_class" recorded in the repo's tokenizer_config.json
  #   2. tokenizer_class on the model config (passed in or auto-loaded)
  #   3. the static TOKENIZER_MAPPING keyed by config class name
  # Raises ArgumentError when no tokenizer class can be resolved, and Todo
  # for not-yet-ported paths (explicit tokenizer_type, custom auto_map).
  config = kwargs.delete(:config)
  kwargs[:_from_auto] = true

  use_fast = kwargs.delete(:use_fast) { true }
  tokenizer_type = kwargs.delete(:tokenizer_type) { nil }
  trust_remote_code = kwargs.delete(:trust_remote_code)

  # Explicit tokenizer_type selection is not ported yet.
  if !tokenizer_type.nil?
    raise Todo
  end

  tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
  # Reuse the resolved revision so subsequent hub lookups hit the same commit.
  if tokenizer_config.include?("_commit_hash")
    kwargs[:_commit_hash] = tokenizer_config["_commit_hash"]
  end
  config_tokenizer_class = tokenizer_config["tokenizer_class"]
  _tokenizer_auto_map = nil
  # Custom tokenizers registered via auto_map (remote code) are not ported yet.
  if tokenizer_config["auto_map"]
    raise Todo
  end

  # If that did not work, let's try to use the config.
  if config_tokenizer_class.nil?
    if !config.is_a?(PretrainedConfig)
      config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path, trust_remote_code: trust_remote_code, **kwargs
      )
    end
    # NOTE: read tokenizer_class from the config regardless of whether it was
    # passed in or auto-loaded above — previously this only happened after
    # auto-loading, so a caller-supplied config's tokenizer_class was ignored
    # (matches the upstream Python implementation).
    config_tokenizer_class = config.tokenizer_class
    # if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
    #     tokenizer_auto_map = config.auto_map["AutoTokenizer"]
  end

  if !config_tokenizer_class.nil?
    tokenizer_class = nil
    # Prefer the fast variant when requested and the name isn't already fast.
    if use_fast && !config_tokenizer_class.end_with?("Fast")
      tokenizer_class_candidate = "#{config_tokenizer_class}Fast"
      tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
    end
    if tokenizer_class.nil?
      tokenizer_class_candidate = config_tokenizer_class
      tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
    end
    if tokenizer_class.nil?
      raise ArgumentError, "Tokenizer class #{tokenizer_class_candidate} does not exist or is not currently imported."
    end
    return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
  end

  # Fall back to the static mapping keyed by the config's class name.
  model_type = config_class_to_model_type(config.class.name.split("::").last)
  if !model_type.nil?
    tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[config.class.name.split("::").last]
    if tokenizer_class_fast && (use_fast || tokenizer_class_py.nil?)
      return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
    else
      if !tokenizer_class_py.nil?
        return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
      else
        raise ArgumentError, "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed in order to use this tokenizer."
      end
    end
  end

  raise ArgumentError, "Unrecognized configuration class #{config.class.name} to build an AutoTokenizer."
end