28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
# File 'lib/transformers/models/auto/tokenization_auto.rb', line 28
# Instantiates a tokenizer for +pretrained_model_name_or_path+, picking the
# tokenizer class from (in order of precedence):
#
# 1. the "tokenizer_class" entry of the tokenizer config,
# 2. the +tokenizer_class+ attribute of the model config (either the one
#    passed via +config:+ or one loaded through AutoConfig),
# 3. the TOKENIZER_MAPPING entry for the model config's class name.
#
# @param pretrained_model_name_or_path identifier forwarded unchanged to
#   get_tokenizer_config, AutoConfig.from_pretrained and the tokenizer
#   class' own +from_pretrained+
# @param inputs extra positional arguments forwarded to the tokenizer class
# @param kwargs options; +:config+, +:use_fast+ (defaults to true when the
#   key is absent), +:tokenizer_type+ and +:trust_remote_code+ are consumed
#   here, everything else is forwarded
# @return whatever the selected tokenizer class' +from_pretrained+ returns
# @raise [Todo] when +:tokenizer_type+ is given or the tokenizer config has
#   an "auto_map" entry (neither is implemented)
# @raise [ArgumentError] when no tokenizer class can be resolved
def from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
  config = kwargs.delete(:config)
  kwargs[:_from_auto] = true

  use_fast = kwargs.delete(:use_fast) { true }
  tokenizer_type = kwargs.delete(:tokenizer_type) { nil }
  trust_remote_code = kwargs.delete(:trust_remote_code)

  # Selecting a tokenizer by explicit type is not implemented.
  raise Todo unless tokenizer_type.nil?

  tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
  if tokenizer_config.include?("_commit_hash")
    kwargs[:_commit_hash] = tokenizer_config["_commit_hash"]
  end
  config_tokenizer_class = tokenizer_config["tokenizer_class"]

  # Tokenizers declared through "auto_map" are not implemented.
  raise Todo if tokenizer_config["auto_map"]

  if config_tokenizer_class.nil?
    unless config.is_a?(PretrainedConfig)
      config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path, trust_remote_code: trust_remote_code, **kwargs
      )
    end
    # Read the attribute for caller-supplied configs too, not only for the
    # freshly loaded one; otherwise an explicit `config:` is silently ignored.
    config_tokenizer_class = config.tokenizer_class
  end

  unless config_tokenizer_class.nil?
    tokenizer_class = nil
    tokenizer_class_candidate = config_tokenizer_class
    # Try the "...Fast" variant first when it was requested and the
    # configured name is not already a fast class.
    if use_fast && !config_tokenizer_class.end_with?("Fast")
      tokenizer_class_candidate = "#{config_tokenizer_class}Fast"
      tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
    end
    if tokenizer_class.nil?
      tokenizer_class_candidate = config_tokenizer_class
      tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
    end
    if tokenizer_class.nil?
      raise ArgumentError, "Tokenizer class #{tokenizer_class_candidate} does not exist or is not currently imported."
    end
    return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
  end

  # No explicit tokenizer class anywhere: fall back to the mapping keyed by
  # the config's unqualified class name.
  config_class_name = config.class.name.split("::").last
  model_type = config_class_to_model_type(config_class_name)
  unless model_type.nil?
    tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[config_class_name]
    if tokenizer_class_fast && (use_fast || tokenizer_class_py.nil?)
      return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
    end
    unless tokenizer_class_py.nil?
      return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
    end
    raise ArgumentError, "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed in order to use this tokenizer."
  end

  raise ArgumentError, "Unrecognized configuration class #{config.class.name} to build an AutoTokenizer."
end
|