Class: Transformers::PreTrainedTokenizerFast

Inherits:
PreTrainedTokenizerBase
Defined in:
lib/transformers/tokenization_utils_fast.rb

Constant Summary

Constants included from SpecialTokensMixin

SpecialTokensMixin::SPECIAL_TOKENS_ATTRIBUTES

Instance Attribute Summary

Attributes inherited from PreTrainedTokenizerBase

#init_kwargs, #model_max_length

Instance Method Summary

Methods inherited from PreTrainedTokenizerBase

#_eventual_warn_about_too_long_sequence, _from_pretrained, #call, from_pretrained

Methods included from ClassAttribute

#class_attribute

Methods included from SpecialTokensMixin

#bos_token_id, #cls_token_id, #eos_token_id, #pad_token_id, #sep_token_id, #special_tokens_map, #unk_token_id

Constructor Details

#initialize(*args, **kwargs) ⇒ PreTrainedTokenizerFast

Returns a new instance of PreTrainedTokenizerFast.



# File 'lib/transformers/tokenization_utils_fast.rb', line 17

def initialize(*args, **kwargs)
  tokenizer_object = kwargs.delete(:tokenizer_object)
  slow_tokenizer = kwargs.delete(:__slow_tokenizer)
  fast_tokenizer_file = kwargs.delete(:tokenizer_file)
  from_slow = kwargs.delete(:from_slow) { false }
  _added_tokens_decoder = kwargs.delete(:added_tokens_decoder)

  if !tokenizer_object.nil?
    fast_tokenizer = Copy.deepcopy(tokenizer_object)
  elsif !fast_tokenizer_file.nil? && !from_slow
    # We have a serialization from tokenizers which lets us build the backend directly
    fast_tokenizer = Tokenizers::Tokenizer.from_file(fast_tokenizer_file)
  elsif !slow_tokenizer.nil?
    # We need to convert a slow tokenizer to build the backend
    fast_tokenizer = ConvertSlowTokenizer.convert_slow_tokenizer(slow_tokenizer)
  elsif !@slow_tokenizer_class.nil?
    # We need to create and convert a slow tokenizer to build the backend
    slow_tokenizer = @slow_tokenizer_class.new(*args, **kwargs)
    fast_tokenizer = ConvertSlowTokenizer.convert_slow_tokenizer(slow_tokenizer)
  else
    raise ArgumentError, <<~MSG
      Couldn't instantiate the backend tokenizer from one of:
      (1) a `tokenizers` library serialization file,
      (2) a slow tokenizer instance to convert or
      (3) an equivalent slow tokenizer class to instantiate and convert.
      You need to have sentencepiece installed to convert a slow tokenizer to a fast one.
    MSG
  end

  @tokenizer = fast_tokenizer

  if !slow_tokenizer.nil?
    kwargs.merge!(slow_tokenizer.init_kwargs)
  end

  @decode_use_source_tokenizer = false

  _truncation = @tokenizer.truncation

  if !_truncation.nil?
    _truncation = _truncation.transform_keys(&:to_sym)
    @tokenizer.enable_truncation(_truncation[:max_length], **_truncation.except(:max_length))
    kwargs[:max_length] ||= _truncation[:max_length]
    kwargs[:truncation_side] ||= _truncation[:direction]
    kwargs[:stride] ||= _truncation[:stride]
    kwargs[:truncation_strategy] ||= _truncation[:strategy]
  else
    @tokenizer.no_truncation
  end

  _padding = @tokenizer.padding
  if !_padding.nil?
    _padding = _padding.transform_keys(&:to_sym)
    @tokenizer.enable_padding(**_padding)
    kwargs[:pad_token] ||= _padding[:pad_token]
    kwargs[:pad_token_type_id] ||= _padding[:pad_token_type_id]
    kwargs[:padding_side] ||= _padding[:direction]
    kwargs[:max_length] ||= _padding[:length]
    kwargs[:pad_to_multiple_of] ||= _padding[:pad_to_multiple_of]
  end

  # We call this after having initialized the backend tokenizer because we update it.
  super(**kwargs)
end
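
The backend can come from any of the three sources handled above. A minimal usage sketch (the tokenizer.json path is a placeholder, not a file shipped with the gem):

backend = Tokenizers::Tokenizer.from_file("path/to/tokenizer.json")
tokenizer = Transformers::PreTrainedTokenizerFast.new(tokenizer_object: backend)

# Or let the constructor load the serialization itself:
tokenizer = Transformers::PreTrainedTokenizerFast.new(tokenizer_file: "path/to/tokenizer.json")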

Instance Method Details

#_convert_token_to_id_with_added_voc(token) ⇒ Object



# File 'lib/transformers/tokenization_utils_fast.rb', line 114

def _convert_token_to_id_with_added_voc(token)
  index = @tokenizer.token_to_id(token)
  if index.nil?
    return unk_token_id
  end
  index
end
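
Looks up a single token in the backend vocabulary, falling back to unk_token_id for unknown tokens. A sketch (token strings are illustrative; tokenizer is an instance built as in the constructor example):

tokenizer._convert_token_to_id_with_added_voc("the")              # => vocabulary id, if present
tokenizer._convert_token_to_id_with_added_voc("never-seen-token") # => unk_token_id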

#backend_tokenizer ⇒ Object



# File 'lib/transformers/tokenization_utils_fast.rb', line 94

def backend_tokenizer
  @tokenizer
end
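
Returns the underlying Tokenizers::Tokenizer, which is useful for reaching backend methods not wrapped by this class. A sketch using token_to_id, a backend method this file already relies on:

tokenizer.backend_tokenizer                      # => Tokenizers::Tokenizer instance
tokenizer.backend_tokenizer.token_to_id("hello") # query the raw backend directly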

#convert_ids_to_tokens(ids, skip_special_tokens: false) ⇒ Object



# File 'lib/transformers/tokenization_utils_fast.rb', line 122

def convert_ids_to_tokens(ids, skip_special_tokens: false)
  if ids.is_a?(Integer)
    return @tokenizer.id_to_token(ids)
  end
  tokens = []
  ids.each do |index|
    index = index.to_i
    if skip_special_tokens && @all_special_ids.include?(index)
      next
    end
    tokens << @tokenizer.id_to_token(index)
  end
  tokens
end
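
Accepts a single id or an array of ids; with skip_special_tokens: true, ids listed in @all_special_ids are dropped from the result. A sketch (ids are illustrative):

tokenizer.convert_ids_to_tokens(101)              # => a single token string
tokenizer.convert_ids_to_tokens([101, 2023, 102]) # => an array of token strings
tokenizer.convert_ids_to_tokens([101, 2023, 102], skip_special_tokens: true)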

#convert_tokens_to_ids(tokens) ⇒ Object



# File 'lib/transformers/tokenization_utils_fast.rb', line 98

def convert_tokens_to_ids(tokens)
  if tokens.nil?
    return nil
  end

  if tokens.is_a?(String)
    return _convert_token_to_id_with_added_voc(tokens)
  end

  ids = []
  tokens.each do |token|
    ids << _convert_token_to_id_with_added_voc(token)
  end
  ids
end
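
The inverse direction: nil passes through, a String yields a single id, and an array yields an array of ids, with unknown tokens mapped to unk_token_id. A sketch (token strings are illustrative):

tokenizer.convert_tokens_to_ids(nil)                # => nil
tokenizer.convert_tokens_to_ids("hello")            # => Integer id (or unk_token_id)
tokenizer.convert_tokens_to_ids(["hello", "world"]) # => array of ids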

#convert_tokens_to_string(tokens) ⇒ Object



# File 'lib/transformers/tokenization_utils_fast.rb', line 137

def convert_tokens_to_string(tokens)
  backend_tokenizer.decoder.decode(tokens)
end
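
Delegates to the backend decoder, which reassembles tokens into plain text (for example, merging WordPiece continuation pieces in BERT-style vocabularies). A sketch (ids are illustrative):

tokens = tokenizer.convert_ids_to_tokens([101, 2023, 102])
tokenizer.convert_tokens_to_string(tokens) # => decoded String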

#get_vocab ⇒ Object



# File 'lib/transformers/tokenization_utils_fast.rb', line 86

def get_vocab
  @tokenizer.vocab(with_added_tokens: true)
end
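
Returns the full vocabulary, including added tokens, as a Hash mapping token strings to integer ids. A sketch:

vocab = tokenizer.get_vocab # => { "token" => id, ... }
vocab.size                  # vocabulary size, counting added tokens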

#is_fast ⇒ Object



# File 'lib/transformers/tokenization_utils_fast.rb', line 82

def is_fast
  true
end

#vocab ⇒ Object



# File 'lib/transformers/tokenization_utils_fast.rb', line 90

def vocab
  get_vocab
end