Class: Transformers::PreTrainedTokenizer

Inherits:
PreTrainedTokenizerBase show all
Defined in:
lib/transformers/tokenization_utils.rb

Constant Summary

Constants included from SpecialTokensMixin

SpecialTokensMixin::SPECIAL_TOKENS_ATTRIBUTES

Instance Attribute Summary

Attributes inherited from PreTrainedTokenizerBase

#init_kwargs, #model_max_length

Instance Method Summary collapse

Methods inherited from PreTrainedTokenizerBase

#_eventual_warn_about_too_long_sequence, _from_pretrained, #call, from_pretrained

Methods included from ClassAttribute

#class_attribute

Methods included from SpecialTokensMixin

#bos_token_id, #cls_token_id, #eos_token_id, #pad_token_id, #sep_token_id, #special_tokens_map, #unk_token_id

Constructor Details

#initialize(**kwargs) ⇒ PreTrainedTokenizer

Returns a new instance of PreTrainedTokenizer.



17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/transformers/tokenization_utils.rb', line 17

# Builds the slow-tokenizer added-token tables, then hands the remaining
# options to the base class.
#
# @param kwargs [Hash] tokenizer options; `:added_tokens_decoder` (id => token)
#   is consumed here, everything else is forwarded to the superclass.
def initialize(**kwargs)
  # Child classes may have populated @added_tokens_decoder already; only
  # create an empty table when they did not.
  @added_tokens_decoder = {} unless instance_variable_defined?(:@added_tokens_decoder)

  # When an `added_tokens_decoder` is passed we are loading from a saved
  # tokenizer, so its entries overwrite any existing ones.
  loaded_decoder = kwargs.delete(:added_tokens_decoder) { {} }
  @added_tokens_decoder.merge!(loaded_decoder)

  # Reverse mapping: token content string => id.
  @added_tokens_encoder = @added_tokens_decoder.each_with_object({}) do |(token, id), encoder|
    encoder[token.content] = id
  end

  # Initialize the parent class with whatever options remain.
  super(**kwargs)
end

Instance Method Details

#_convert_token_to_id(token) ⇒ Object

Raises:

  • (NotImplementedError)


147
148
149
# File 'lib/transformers/tokenization_utils.rb', line 147

# Converts a single token (string) into its integer id using the model
# vocabulary. Concrete tokenizer subclasses must override this.
#
# @param token [String] the token to look up
# @raise [NotImplementedError] always; subclasses must implement it
def _convert_token_to_id(token)
  raise NotImplementedError, "#{self.class.name} must implement _convert_token_to_id"
end

#_convert_token_to_id_with_added_voc(token) ⇒ Object



136
137
138
139
140
141
142
143
144
145
# File 'lib/transformers/tokenization_utils.rb', line 136

# Looks up a token's id, consulting the added-token table first and
# falling back to the model vocabulary (`_convert_token_to_id`).
#
# @param token [String, nil] the token to look up
# @return [Integer, nil] the id, or nil when token is nil
def _convert_token_to_id_with_added_voc(token)
  return nil if token.nil?

  @added_tokens_encoder.fetch(token) { _convert_token_to_id(token) }
end

#_encode_plus(text:, text_pair: nil, add_special_tokens: true, padding_strategy: PaddingStrategy::DO_NOT_PAD, truncation_strategy: TruncationStrategy::DO_NOT_TRUNCATE, max_length: nil, stride: 0, is_split_into_words: false, pad_to_multiple_of: nil, return_tensors: nil, return_token_type_ids: nil, return_attention_mask: nil, return_overflowing_tokens: false, return_special_tokens_mask: false, return_offsets_mapping: false, return_length: false, verbose: true, **kwargs) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/transformers/tokenization_utils.rb', line 43

# Tokenizes and encodes a single text (or text pair) into model inputs,
# delegating padding/truncation/tensor conversion to `prepare_for_model`.
# Accepts raw strings, pre-tokenized string arrays, or id arrays.
def _encode_plus(
  text:,
  text_pair: nil,
  add_special_tokens: true,
  padding_strategy: PaddingStrategy::DO_NOT_PAD,
  truncation_strategy: TruncationStrategy::DO_NOT_TRUNCATE,
  max_length: nil,
  stride: 0,
  is_split_into_words: false,
  pad_to_multiple_of: nil,
  return_tensors: nil,
  return_token_type_ids: nil,
  return_attention_mask: nil,
  return_overflowing_tokens: false,
  return_special_tokens_mask: false,
  return_offsets_mapping: false,
  return_length: false,
  verbose: true,
  **kwargs
)
  # Offset mapping requires a fast (Rust-backed) tokenizer; fail early.
  if return_offsets_mapping
    raise RuntimeError,
      "return_offset_mapping is not available when using Ruby tokenizers. " +
      "To use this feature, change your tokenizer to one deriving from " +
      "Transformers::PreTrainedTokenizerFast. " +
      "More information on available tokenizers at " +
      "https://github.com/huggingface/transformers/pull/2674"
  end

  # Normalizes one input (string / token array / id array) into an id array.
  extract_ids = lambda do |input|
    if input.is_a?(String)
      convert_tokens_to_ids(tokenize(input, **kwargs))
    elsif input.is_a?(Array) && !input.empty? && input.first.is_a?(String)
      # Pre-tokenized word-level input is not supported yet.
      raise Todo if is_split_into_words
      convert_tokens_to_ids(input)
    elsif input.is_a?(Array) && !input.empty? && input.first.is_a?(Integer)
      # Already encoded; pass through unchanged.
      input
    elsif is_split_into_words
      raise ArgumentError,
        "Input #{input} is not valid. Should be a string or a list/tuple of strings when" +
        " `is_split_into_words=True`."
    else
      raise ArgumentError,
        "Input #{input} is not valid. Should be a string, a list/tuple of strings or a list/tuple of" +
        " integers."
    end
  end

  first_ids = extract_ids.(text)
  second_ids = text_pair.nil? ? nil : extract_ids.(text_pair)

  prepare_for_model(
    first_ids,
    pair_ids: second_ids,
    add_special_tokens: add_special_tokens,
    padding: padding_strategy,
    truncation: truncation_strategy,
    max_length: max_length,
    stride: stride,
    pad_to_multiple_of: pad_to_multiple_of,
    return_tensors: return_tensors,
    prepend_batch_axis: true,
    return_attention_mask: return_attention_mask,
    return_token_type_ids: return_token_type_ids,
    return_overflowing_tokens: return_overflowing_tokens,
    return_special_tokens_mask: return_special_tokens_mask,
    return_length: return_length,
    verbose: verbose
  )
end

#convert_tokens_to_ids(tokens) ⇒ Object



120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# File 'lib/transformers/tokenization_utils.rb', line 120

# Converts a single token string or a list of tokens into the
# corresponding id(s), consulting added tokens before the vocabulary.
#
# @param tokens [String, Array<String>, nil]
# @return [Integer, Array<Integer>, nil]
def convert_tokens_to_ids(tokens)
  return nil if tokens.nil?
  return _convert_token_to_id_with_added_voc(tokens) if tokens.is_a?(String)

  tokens.map { |token| _convert_token_to_id_with_added_voc(token) }
end

#is_fast ⇒ Object



31
32
33
# File 'lib/transformers/tokenization_utils.rb', line 31

# Whether this tokenizer is backed by a fast (Rust) implementation.
# This pure-Ruby base class always reports false; fast tokenizers live
# in PreTrainedTokenizerFast.
def is_fast
  false
end

#tokenize(text, **kwargs) ⇒ Object

Raises:

  • (Todo)

39
40
41
# File 'lib/transformers/tokenization_utils.rb', line 39

# Converts a string into a sequence of tokens.
#
# @param text [String] the text to tokenize
# @raise [Todo] always — NOTE(review): `Todo` appears to mark
#   functionality not yet ported to Ruby; confirm against the library's
#   error classes.
def tokenize(text, **kwargs)
  raise Todo
end

#vocab_size ⇒ Object

Raises:

  • (NotImplementedError)


35
36
37
# File 'lib/transformers/tokenization_utils.rb', line 35

# Returns the size of the base vocabulary (without added tokens).
# Concrete tokenizer subclasses must override this.
#
# @return [Integer]
# @raise [NotImplementedError] always; subclasses must implement it
def vocab_size
  raise NotImplementedError, "#{self.class.name} must implement vocab_size"
end