Class: Informers::M2M100Tokenizer

Inherits:
PreTrainedTokenizer show all
Defined in:
lib/informers/tokenizers.rb

Instance Attribute Summary collapse

Attributes inherited from PreTrainedTokenizer

#mask_token, #mask_token_id, #sep_token_id

Instance Method Summary collapse

Methods inherited from PreTrainedTokenizer

#batch_decode, #call, #convert_tokens_to_ids, #convert_tokens_to_string, #decode, #get_token, #id_to_token, #padding_side=

Constructor Details

#initialize(tokenizer_json, tokenizer_config) ⇒ M2M100Tokenizer

Returns a new instance of M2M100Tokenizer.



200
201
202
203
204
205
206
207
208
# File 'lib/informers/tokenizers.rb', line 200

# Builds the tokenizer, then derives the language helpers from the special
# tokens.
#
# Reads @special_tokens immediately after calling super, so it is presumably
# populated by the parent constructor (not visible here — confirm).
#
# @param tokenizer_json [Object] forwarded unchanged to the parent constructor
# @param tokenizer_config [Object] forwarded unchanged to the parent constructor
def initialize(tokenizer_json, tokenizer_config)
  super(tokenizer_json, tokenizer_config)

  # Language tokens look like "__en__" or "__fra__". \A and \z anchor the
  # whole string; ^ and $ would also accept a match on a single line of a
  # multi-line token, which would then be sliced incorrectly below.
  @language_regex = /\A__[a-z]{2,3}__\z/

  # Strip the two-underscore wrapper on each side using a range.
  # String#slice(2, -2) treats -2 as a *length* and returns nil, which
  # previously filled this list with nils instead of language codes.
  @language_codes = @special_tokens
    .select { |token| @language_regex.match?(token) }
    .map { |token| token[2...-2] }

  # Inverse of the stripping above: "en" -> "__en__".
  @lang_to_token = ->(code) { "__#{code}__" }
end

Instance Attribute Details

#lang_to_token ⇒ Object (readonly)

Returns the value of attribute lang_to_token.



198
199
200
# File 'lib/informers/tokenizers.rb', line 198

# Reader for @lang_to_token.
#
# @return [Object] the stored value; the constructor in this file assigns a
#   lambda that wraps its argument in double underscores ("__#{x}__")
def lang_to_token = @lang_to_token

#language_codes ⇒ Object (readonly)

Returns the value of attribute language_codes.



198
199
200
# File 'lib/informers/tokenizers.rb', line 198

# Reader for @language_codes.
#
# @return [Object] the stored value; the constructor in this file builds it
#   from the entries of @special_tokens that match the language regex
def language_codes = @language_codes

#language_regex ⇒ Object (readonly)

Returns the value of attribute language_regex.



198
199
200
# File 'lib/informers/tokenizers.rb', line 198

# Reader for @language_regex.
#
# @return [Object] the stored value; the constructor in this file assigns the
#   Regexp it uses to pick language tokens out of @special_tokens
def language_regex = @language_regex

Instance Method Details

#_build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs) ⇒ Object



210
211
212
# File 'lib/informers/tokenizers.rb', line 210

# Delegates to Utils._build_translation_inputs, passing this tokenizer as the
# first argument followed by the three arguments unchanged.
#
# @param raw_inputs [Object] forwarded as-is
# @param tokenizer_options [Object] forwarded as-is
# @param generate_kwargs [Object] forwarded as-is
# @return [Object] whatever Utils._build_translation_inputs returns
def _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs)
  Utils._build_translation_inputs(
    self,
    raw_inputs,
    tokenizer_options,
    generate_kwargs
  )
end