Class: Transformers::PreTrainedTokenizer

Inherits:
PreTrainedTokenizerBase show all
Defined in:
lib/transformers/tokenization_utils.rb

Constant Summary

Constants included from SpecialTokensMixin

SpecialTokensMixin::SPECIAL_TOKENS_ATTRIBUTES

Instance Attribute Summary

Attributes inherited from PreTrainedTokenizerBase

#init_kwargs, #model_max_length

Instance Method Summary collapse

Methods inherited from PreTrainedTokenizerBase

#_eventual_warn_about_too_long_sequence, _from_pretrained, #call, from_pretrained

Methods included from ClassAttribute

#class_attribute

Methods included from SpecialTokensMixin

#bos_token_id, #cls_token_id, #eos_token_id, #pad_token_id, #sep_token_id, #special_tokens_map, #unk_token_id

Constructor Details

#initialize(**kwargs) ⇒ PreTrainedTokenizer

Returns a new instance of PreTrainedTokenizer.



17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/transformers/tokenization_utils.rb', line 17

# Builds the slow-tokenizer added-token tables, then hands the remaining
# options to the base class.
#
# @param kwargs [Hash] tokenizer options; `:added_tokens_decoder` (id => token)
#   is consumed here, everything else is forwarded to the superclass.
def initialize(**kwargs)
  # Child classes may have populated @added_tokens_decoder already; only
  # create an empty table when they did not.
  @added_tokens_decoder = {} unless instance_variable_defined?(:@added_tokens_decoder)

  # When an `added_tokens_decoder` is passed we are loading from a saved
  # tokenizer, so its entries overwrite any existing ones.
  loaded_decoder = kwargs.delete(:added_tokens_decoder) { {} }
  @added_tokens_decoder.merge!(loaded_decoder)

  # Reverse mapping: token content string => id.
  @added_tokens_encoder = @added_tokens_decoder.each_with_object({}) do |(token, id), encoder|
    encoder[token.content] = id
  end

  # Initialize the parent class with whatever options remain.
  super(**kwargs)
end

Instance Method Details

#_convert_token_to_id(token) ⇒ Object

Raises:

  • (NotImplementedError)


147
148
149
# File 'lib/transformers/tokenization_utils.rb', line 147

# Converts a single token (string) into its integer id using the model
# vocabulary. Concrete tokenizer subclasses must override this.
#
# @param token [String] the token to look up
# @raise [NotImplementedError] always; subclasses must implement it
def _convert_token_to_id(token)
  raise NotImplementedError, "#{self.class.name} must implement _convert_token_to_id"
end

#_convert_token_to_id_with_added_voc(token) ⇒ Object



136
137
138
139
140
141
142
143
144
145
# File 'lib/transformers/tokenization_utils.rb', line 136

# Looks up a token's id, consulting the added-token table first and
# falling back to the model vocabulary (`_convert_token_to_id`).
#
# @param token [String, nil] the token to look up
# @return [Integer, nil] the id, or nil when token is nil
def _convert_token_to_id_with_added_voc(token)
  return nil if token.nil?

  @added_tokens_encoder.fetch(token) { _convert_token_to_id(token) }
end

#_encode_plus(text:, text_pair: nil, add_special_tokens: true, padding_strategy: PaddingStrategy::DO_NOT_PAD, truncation_strategy: TruncationStrategy::DO_NOT_TRUNCATE, max_length: nil, stride: 0, is_split_into_words: false, pad_to_multiple_of: nil, return_tensors: nil, return_token_type_ids: nil, return_attention_mask: nil, return_overflowing_tokens: false, return_special_tokens_mask: false, return_offsets_mapping: false, return_length: false, verbose: true, **kwargs) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/transformers/tokenization_utils.rb', line 43

# Tokenizes and encodes a single text (or text pair) into model inputs,
# delegating padding/truncation/tensor conversion to `prepare_for_model`.
# Accepts raw strings, pre-tokenized string arrays, or id arrays.
def _encode_plus(
  text:,
  text_pair: nil,
  add_special_tokens: true,
  padding_strategy: PaddingStrategy::DO_NOT_PAD,
  truncation_strategy: TruncationStrategy::DO_NOT_TRUNCATE,
  max_length: nil,
  stride: 0,
  is_split_into_words: false,
  pad_to_multiple_of: nil,
  return_tensors: nil,
  return_token_type_ids: nil,
  return_attention_mask: nil,
  return_overflowing_tokens: false,
  return_special_tokens_mask: false,
  return_offsets_mapping: false,
  return_length: false,
  verbose: true,
  **kwargs
)
  # Offset mapping requires a fast (Rust-backed) tokenizer; fail early.
  if return_offsets_mapping
    raise RuntimeError,
      "return_offset_mapping is not available when using Ruby tokenizers. " +
      "To use this feature, change your tokenizer to one deriving from " +
      "Transformers::PreTrainedTokenizerFast. " +
      "More information on available tokenizers at " +
      "https://github.com/huggingface/transformers/pull/2674"
  end

  # Normalizes one input (string / token array / id array) into an id array.
  extract_ids = lambda do |input|
    if input.is_a?(String)
      convert_tokens_to_ids(tokenize(input, **kwargs))
    elsif input.is_a?(Array) && !input.empty? && input.first.is_a?(String)
      # Pre-tokenized word-level input is not supported yet.
      raise Todo if is_split_into_words
      convert_tokens_to_ids(input)
    elsif input.is_a?(Array) && !input.empty? && input.first.is_a?(Integer)
      # Already encoded; pass through unchanged.
      input
    elsif is_split_into_words
      raise ArgumentError,
        "Input #{input} is not valid. Should be a string or a list/tuple of strings when" +
        " `is_split_into_words=True`."
    else
      raise ArgumentError,
        "Input #{input} is not valid. Should be a string, a list/tuple of strings or a list/tuple of" +
        " integers."
    end
  end

  first_ids = extract_ids.(text)
  second_ids = text_pair.nil? ? nil : extract_ids.(text_pair)

  prepare_for_model(
    first_ids,
    pair_ids: second_ids,
    add_special_tokens: add_special_tokens,
    padding: padding_strategy,
    truncation: truncation_strategy,
    max_length: max_length,
    stride: stride,
    pad_to_multiple_of: pad_to_multiple_of,
    return_tensors: return_tensors,
    prepend_batch_axis: true,
    return_attention_mask: return_attention_mask,
    return_token_type_ids: return_token_type_ids,
    return_overflowing_tokens: return_overflowing_tokens,
    return_special_tokens_mask: return_special_tokens_mask,
    return_length: return_length,
    verbose: verbose
  )
end

#convert_tokens_to_ids(tokens) ⇒ Object



120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# File 'lib/transformers/tokenization_utils.rb', line 120

# Converts a single token string or a list of tokens into the
# corresponding id(s), consulting added tokens before the vocabulary.
#
# @param tokens [String, Array<String>, nil]
# @return [Integer, Array<Integer>, nil]
def convert_tokens_to_ids(tokens)
  return nil if tokens.nil?
  return _convert_token_to_id_with_added_voc(tokens) if tokens.is_a?(String)

  tokens.map { |token| _convert_token_to_id_with_added_voc(token) }
end

#is_fast ⇒ Object



31
32
33
# File 'lib/transformers/tokenization_utils.rb', line 31

# Whether this tokenizer is backed by a fast (Rust) implementation.
# This pure-Ruby base class always reports false; fast tokenizers live
# in PreTrainedTokenizerFast.
def is_fast
  false
end

#tokenize(text, **kwargs) ⇒ Object

Raises:

  • (Todo)

39
40
41
# File 'lib/transformers/tokenization_utils.rb', line 39

# Converts a string into a sequence of tokens.
#
# @param text [String] the text to tokenize
# @raise [Todo] always — NOTE(review): `Todo` appears to mark
#   functionality not yet ported to Ruby; confirm against the library's
#   error classes.
def tokenize(text, **kwargs)
  raise Todo
end

#vocab_size ⇒ Object

Raises:

  • (NotImplementedError)


35
36
37
# File 'lib/transformers/tokenization_utils.rb', line 35

# Returns the size of the base vocabulary (without added tokens).
# Concrete tokenizer subclasses must override this.
#
# @return [Integer]
# @raise [NotImplementedError] always; subclasses must implement it
def vocab_size
  raise NotImplementedError, "#{self.class.name} must implement vocab_size"
end