Class: Informers::NllbTokenizer

Inherits:
PreTrainedTokenizer show all
Defined in:
lib/informers/tokenizers.rb

Instance Attribute Summary collapse

Attributes inherited from PreTrainedTokenizer

#mask_token, #mask_token_id, #sep_token_id

Instance Method Summary collapse

Methods inherited from PreTrainedTokenizer

#batch_decode, #call, #convert_tokens_to_ids, #convert_tokens_to_string, #decode, #get_token, #id_to_token, #padding_side=

Constructor Details

#initialize(tokenizer_json, tokenizer_config) ⇒ NllbTokenizer

Returns a new instance of NllbTokenizer.



184
185
186
187
188
189
190
# File 'lib/informers/tokenizers.rb', line 184

# Builds the tokenizer via the parent constructor, then records which of the
# special tokens are language codes.
#
# A language code here is a whole string shaped like three lowercase letters,
# an underscore, then one uppercase letter followed by three lowercase letters
# (for example "eng_Latn").
#
# @param tokenizer_json forwarded unchanged to the parent constructor
# @param tokenizer_config forwarded unchanged to the parent constructor
def initialize(tokenizer_json, tokenizer_config)
  super(tokenizer_json, tokenizer_config)

  # Use \A and \z rather than ^ and $: in Ruby the latter match at every line
  # boundary, so a multi-line string containing one code-shaped line would
  # otherwise be accepted as a language code.
  @language_regex = /\A[a-z]{3}_[A-Z][a-z]{3}\z/
  # @special_tokens is presumably populated by the parent constructor — it is
  # not assigned anywhere in this method.
  @language_codes = @special_tokens.filter { |x| @language_regex.match?(x) }
  @lang_to_token = ->(x) { x } # Identity function
end

Instance Attribute Details

#lang_to_token ⇒ Object (readonly)

Returns the value of attribute lang_to_token.



182
183
184
# File 'lib/informers/tokenizers.rb', line 182

# Reader for the @lang_to_token instance variable.
#
# The constructor assigns a one-argument lambda that returns its argument
# unchanged.
#
# @return [Object] the current value of @lang_to_token
def lang_to_token
  @lang_to_token
end

#language_codes ⇒ Object (readonly)

Returns the value of attribute language_codes.



182
183
184
# File 'lib/informers/tokenizers.rb', line 182

# Reader for the @language_codes instance variable.
#
# The constructor fills it with the entries of @special_tokens that match
# the language-code regex.
#
# @return [Object] the current value of @language_codes
def language_codes
  @language_codes
end

#language_regex ⇒ Object (readonly)

Returns the value of attribute language_regex.



182
183
184
# File 'lib/informers/tokenizers.rb', line 182

# Reader for the @language_regex instance variable.
#
# The constructor uses this pattern to select language codes from the
# special tokens.
#
# @return [Object] the current value of @language_regex
def language_regex
  @language_regex
end

Instance Method Details

#_build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs) ⇒ Object



192
193
194
# File 'lib/informers/tokenizers.rb', line 192

# Delegates to Utils._build_translation_inputs, passing this tokenizer as the
# first argument followed by the three arguments unchanged.
#
# NOTE(review): the expected shapes of raw_inputs, tokenizer_options and
# generate_kwargs are defined by the Utils helper, which is not visible
# here — confirm against that helper.
#
# @return [Object] whatever Utils._build_translation_inputs returns
def _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs)
  Utils._build_translation_inputs(self, raw_inputs, tokenizer_options, generate_kwargs)
end