Class: Transformers::Mpnet::MPNetTokenizerFast
- Inherits: PreTrainedTokenizerFast
  - Object
  - PreTrainedTokenizerBase
  - PreTrainedTokenizerFast
  - Transformers::Mpnet::MPNetTokenizerFast
- Defined in: lib/transformers/models/mpnet/tokenization_mpnet_fast.rb
Constant Summary
- VOCAB_FILES_NAMES = {vocab_file: "vocab.txt", tokenizer_file: "tokenizer.json"}
Constants included from SpecialTokensMixin
SpecialTokensMixin::SPECIAL_TOKENS_ATTRIBUTES
Instance Attribute Summary
Attributes inherited from PreTrainedTokenizerBase
#init_kwargs, #model_max_length
Instance Method Summary
- #build_inputs_with_special_tokens(token_ids_0, token_ids_1: nil) ⇒ Object
- #create_token_type_ids_from_sequences(token_ids_0, token_ids_1: nil) ⇒ Object
- #initialize(vocab_file: nil, tokenizer_file: nil, do_lower_case: true, bos_token: "<s>", eos_token: "</s>", sep_token: "</s>", cls_token: "<s>", unk_token: "[UNK]", pad_token: "<pad>", mask_token: "<mask>", tokenize_chinese_chars: true, strip_accents: nil, **kwargs) ⇒ MPNetTokenizerFast (constructor)
  A new instance of MPNetTokenizerFast.
- #mask_token ⇒ Object
- #mask_token=(value) ⇒ Object
- #save_vocabulary(save_directory, filename_prefix: nil) ⇒ Object
Methods inherited from PreTrainedTokenizerFast
#_convert_token_to_id_with_added_voc, #backend_tokenizer, #convert_ids_to_tokens, #convert_tokens_to_ids, #convert_tokens_to_string, #get_vocab, #is_fast, #vocab
Methods inherited from PreTrainedTokenizerBase
#_eventual_warn_about_too_long_sequence, _from_pretrained, #call, from_pretrained
Methods included from ClassAttribute
class_attribute
Methods included from SpecialTokensMixin
#bos_token_id, #cls_token_id, #eos_token_id, #pad_token_id, #sep_token_id, #special_tokens_map, #unk_token_id
Constructor Details
#initialize(vocab_file: nil, tokenizer_file: nil, do_lower_case: true, bos_token: "<s>", eos_token: "</s>", sep_token: "</s>", cls_token: "<s>", unk_token: "[UNK]", pad_token: "<pad>", mask_token: "<mask>", tokenize_chinese_chars: true, strip_accents: nil, **kwargs) ⇒ MPNetTokenizerFast
Returns a new instance of MPNetTokenizerFast.
# File 'lib/transformers/models/mpnet/tokenization_mpnet_fast.rb', line 25

def initialize(
  vocab_file: nil,
  tokenizer_file: nil,
  do_lower_case: true,
  bos_token: "<s>",
  eos_token: "</s>",
  sep_token: "</s>",
  cls_token: "<s>",
  unk_token: "[UNK]",
  pad_token: "<pad>",
  mask_token: "<mask>",
  tokenize_chinese_chars: true,
  strip_accents: nil,
  **kwargs
)
  bos_token = bos_token.is_a?(String) ? Tokenizers::AddedToken.new(bos_token, lstrip: false, rstrip: false) : bos_token
  eos_token = eos_token.is_a?(String) ? Tokenizers::AddedToken.new(eos_token, lstrip: false, rstrip: false) : eos_token
  sep_token = sep_token.is_a?(String) ? Tokenizers::AddedToken.new(sep_token, lstrip: false, rstrip: false) : sep_token
  cls_token = cls_token.is_a?(String) ? Tokenizers::AddedToken.new(cls_token, lstrip: false, rstrip: false) : cls_token
  unk_token = unk_token.is_a?(String) ? Tokenizers::AddedToken.new(unk_token, lstrip: false, rstrip: false) : unk_token
  pad_token = pad_token.is_a?(String) ? Tokenizers::AddedToken.new(pad_token, lstrip: false, rstrip: false) : pad_token

  # The mask token behaves like a normal word, i.e. includes the space before it
  mask_token = mask_token.is_a?(String) ? Tokenizers::AddedToken.new(mask_token, lstrip: true, rstrip: false) : mask_token

  super(
    vocab_file,
    tokenizer_file: tokenizer_file,
    do_lower_case: do_lower_case,
    bos_token: bos_token,
    eos_token: eos_token,
    sep_token: sep_token,
    cls_token: cls_token,
    unk_token: unk_token,
    pad_token: pad_token,
    mask_token: mask_token,
    tokenize_chinese_chars: tokenize_chinese_chars,
    strip_accents: strip_accents,
    **kwargs
  )

  # TODO support
  # pre_tok_state = JSON.parse(backend_tokenizer.normalizer.__getstate__)
  # if (pre_tok_state["lowercase"] || do_lower_case) != do_lower_case || (pre_tok_state["strip_accents"] || strip_accents) != strip_accents
  #   pre_tok_class = getattr(normalizers, pre_tok_state.delete("type"))
  #   pre_tok_state["lowercase"] = do_lower_case
  #   pre_tok_state["strip_accents"] = strip_accents
  #   @normalizer = pre_tok_class(**pre_tok_state)
  # end

  @do_lower_case = do_lower_case
end
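In practice this constructor is rarely called directly; tokenizers are usually loaded through the inherited from_pretrained class method (see PreTrainedTokenizerBase above). A minimal usage sketch, assuming the gem is already loaded and using "microsoft/mpnet-base" purely as an example checkpoint id:

tokenizer = Transformers::Mpnet::MPNetTokenizerFast.from_pretrained("microsoft/mpnet-base")

# Encode text through the inherited #call method. The exact shape of the
# returned encoding (e.g. the :input_ids key) comes from the base class,
# so treat this as a sketch rather than a guaranteed interface.
encoding = tokenizer.call("MPNet is a pretrained language model.")
encoding[:input_ids]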
Instance Method Details
#build_inputs_with_special_tokens(token_ids_0, token_ids_1: nil) ⇒ Object
# File 'lib/transformers/models/mpnet/tokenization_mpnet_fast.rb', line 81

def build_inputs_with_special_tokens(token_ids_0, token_ids_1: nil)
  output = [@bos_token_id] + token_ids_0 + [@eos_token_id]

  if token_ids_1.nil?
    return output
  end

  output + [@eos_token_id] + token_ids_1 + [@eos_token_id]
end
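From the implementation above, a single sequence is wrapped as <s> A </s> and a pair as <s> A </s></s> B </s> (note the doubled separator, as in RoBERTa). A sketch with illustrative ids, assuming bos_token_id is 0 and eos_token_id is 2; the actual values depend on the loaded vocabulary:

# Suppose @bos_token_id == 0 and @eos_token_id == 2 (illustrative values).
tokenizer.build_inputs_with_special_tokens([10, 11])
# => [0, 10, 11, 2]

tokenizer.build_inputs_with_special_tokens([10, 11], token_ids_1: [20, 21])
# => [0, 10, 11, 2, 2, 20, 21, 2]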
#create_token_type_ids_from_sequences(token_ids_0, token_ids_1: nil) ⇒ Object
# File 'lib/transformers/models/mpnet/tokenization_mpnet_fast.rb', line 90

def create_token_type_ids_from_sequences(token_ids_0, token_ids_1: nil)
  sep = [@sep_token_id]
  cls = [@cls_token_id]

  if token_ids_1.nil?
    return [0] * (cls + token_ids_0 + sep).length
  end

  [0] * (cls + token_ids_0 + sep + sep + token_ids_1 + sep).length
end
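MPNet does not make use of token type ids, so this method returns a list of zeros sized to match the sequence built by #build_inputs_with_special_tokens. Continuing the illustrative example above:

tokenizer.create_token_type_ids_from_sequences([10, 11])
# => [0, 0, 0, 0]                 (same length as <s> A </s>)

tokenizer.create_token_type_ids_from_sequences([10, 11], token_ids_1: [20, 21])
# => [0, 0, 0, 0, 0, 0, 0, 0]     (same length as <s> A </s></s> B </s>)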
#mask_token ⇒ Object
# File 'lib/transformers/models/mpnet/tokenization_mpnet_fast.rb', line 64

def mask_token
  if @mask_token.nil?
    if @verbose
      Transformers.logger.error("Using mask_token, but it is not set yet.")
    end
    return nil
  end
  @mask_token.to_s
end
#mask_token=(value) ⇒ Object
# File 'lib/transformers/models/mpnet/tokenization_mpnet_fast.rb', line 74

def mask_token=(value)
  # The mask token behaves like a normal word, i.e. includes the space
  # before it, so we set lstrip to true
  value = value.is_a?(String) ? Tokenizers::AddedToken.new(value, lstrip: true, rstrip: false) : value
  @mask_token = value
end
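Together with the #mask_token reader, this setter ensures a String value is wrapped in an AddedToken with lstrip: true, so the space preceding the mask token is treated as part of it (useful for fill-mask style inputs). A short sketch, with "<special_mask>" as an arbitrary example value:

tokenizer.mask_token        # => "<mask>"

# A plain String is converted to an AddedToken with lstrip: true, so the
# new token matches even with a leading space in the text.
tokenizer.mask_token = "<special_mask>"
tokenizer.mask_token        # => "<special_mask>"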
#save_vocabulary(save_directory, filename_prefix: nil) ⇒ Object
# File 'lib/transformers/models/mpnet/tokenization_mpnet_fast.rb', line 100

def save_vocabulary(save_directory, filename_prefix: nil)
  files = @tokenizer.model.save(save_directory, name: filename_prefix)
  Array(files)
end
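This delegates to the backend tokenizer's model and returns the paths it wrote. A sketch, with "/tmp/mpnet" as an arbitrary example directory; the exact files written depend on the backend model:

files = tokenizer.save_vocabulary("/tmp/mpnet")
# => e.g. ["/tmp/mpnet/vocab.txt"]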