Class: Transformers::Mpnet::MPNetTokenizerFast

Inherits:
PreTrainedTokenizerFast
Defined in:
lib/transformers/models/mpnet/tokenization_mpnet_fast.rb

Constant Summary

VOCAB_FILES_NAMES =
{vocab_file: "vocab.txt", tokenizer_file: "tokenizer.json"}

Constants included from SpecialTokensMixin

SpecialTokensMixin::SPECIAL_TOKENS_ATTRIBUTES

Instance Attribute Summary

Attributes inherited from PreTrainedTokenizerBase

#init_kwargs, #model_max_length

Instance Method Summary

Methods inherited from PreTrainedTokenizerFast

#_convert_token_to_id_with_added_voc, #backend_tokenizer, #convert_ids_to_tokens, #convert_tokens_to_ids, #convert_tokens_to_string, #get_vocab, #is_fast, #vocab

Methods inherited from PreTrainedTokenizerBase

#_eventual_warn_about_too_long_sequence, _from_pretrained, #call, from_pretrained

Methods included from ClassAttribute

#class_attribute

Methods included from SpecialTokensMixin

#bos_token_id, #cls_token_id, #eos_token_id, #pad_token_id, #sep_token_id, #special_tokens_map, #unk_token_id

Constructor Details

#initialize(vocab_file: nil, tokenizer_file: nil, do_lower_case: true, bos_token: "<s>", eos_token: "</s>", sep_token: "</s>", cls_token: "<s>", unk_token: "[UNK]", pad_token: "<pad>", mask_token: "<mask>", tokenize_chinese_chars: true, strip_accents: nil, **kwargs) ⇒ MPNetTokenizerFast

Returns a new instance of MPNetTokenizerFast.



# File 'lib/transformers/models/mpnet/tokenization_mpnet_fast.rb', line 25

def initialize(
  vocab_file: nil,
  tokenizer_file: nil,
  do_lower_case: true,
  bos_token: "<s>",
  eos_token: "</s>",
  sep_token: "</s>",
  cls_token: "<s>",
  unk_token: "[UNK]",
  pad_token: "<pad>",
  mask_token: "<mask>",
  tokenize_chinese_chars: true,
  strip_accents: nil,
  **kwargs
)
  bos_token = bos_token.is_a?(String) ? Tokenizers::AddedToken.new(bos_token, lstrip: false, rstrip: false) : bos_token
  eos_token = eos_token.is_a?(String) ? Tokenizers::AddedToken.new(eos_token, lstrip: false, rstrip: false) : eos_token
  sep_token = sep_token.is_a?(String) ? Tokenizers::AddedToken.new(sep_token, lstrip: false, rstrip: false) : sep_token
  cls_token = cls_token.is_a?(String) ? Tokenizers::AddedToken.new(cls_token, lstrip: false, rstrip: false) : cls_token
  unk_token = unk_token.is_a?(String) ? Tokenizers::AddedToken.new(unk_token, lstrip: false, rstrip: false) : unk_token
  pad_token = pad_token.is_a?(String) ? Tokenizers::AddedToken.new(pad_token, lstrip: false, rstrip: false) : pad_token

  # The mask token behaves like a normal word, i.e. it includes the space before it
  mask_token = mask_token.is_a?(String) ? Tokenizers::AddedToken.new(mask_token, lstrip: true, rstrip: false) : mask_token

  super(
    vocab_file,
    tokenizer_file: tokenizer_file,
    do_lower_case: do_lower_case,
    bos_token: bos_token,
    eos_token: eos_token,
    sep_token: sep_token,
    cls_token: cls_token,
    unk_token: unk_token,
    pad_token: pad_token,
    mask_token: mask_token,
    tokenize_chinese_chars: tokenize_chinese_chars,
    strip_accents: strip_accents,
    **kwargs
  )

  # TODO support
  # pre_tok_state = JSON.parse(backend_tokenizer.normalizer.__getstate__)
  # if (pre_tok_state["lowercase"] || do_lower_case) != do_lower_case || (pre_tok_state["strip_accents"] || strip_accents) != strip_accents
  #   pre_tok_class = getattr(normalizers, pre_tok_state.delete("type"))
  #   pre_tok_state["lowercase"] = do_lower_case
  #   pre_tok_state["strip_accents"] = strip_accents
  #   @normalizer = pre_tok_class(**pre_tok_state)
  # end

  @do_lower_case = do_lower_case
end
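
A minimal usage sketch (the model id "microsoft/mpnet-base" is an assumption shown for illustration; from_pretrained and #call are inherited from PreTrainedTokenizerBase):

# Assumed model id, for illustration only
tokenizer = Transformers::Mpnet::MPNetTokenizerFast.from_pretrained("microsoft/mpnet-base")

# #call is inherited from PreTrainedTokenizerBase
encoding = tokenizer.("MPNet is a pre-trained language model.")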

Instance Method Details

#build_inputs_with_special_tokens(token_ids_0, token_ids_1: nil) ⇒ Object



# File 'lib/transformers/models/mpnet/tokenization_mpnet_fast.rb', line 81

def build_inputs_with_special_tokens(token_ids_0, token_ids_1: nil)
  output = [@bos_token_id] + token_ids_0 + [@eos_token_id]
  if token_ids_1.nil?
    return output
  end

  output + [@eos_token_id] + token_ids_1 + [@eos_token_id]
end
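
With the default special tokens this yields the MPNet sequence formats (a sketch; the integer ids below are placeholders):

# Single sequence: <s> A </s>
tokenizer.build_inputs_with_special_tokens([101, 102])
# => [<bos_token_id>, 101, 102, <eos_token_id>]

# Pair of sequences: <s> A </s></s> B </s>
tokenizer.build_inputs_with_special_tokens([101, 102], token_ids_1: [201])
# => [<bos_token_id>, 101, 102, <eos_token_id>, <eos_token_id>, 201, <eos_token_id>]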

#create_token_type_ids_from_sequences(token_ids_0, token_ids_1: nil) ⇒ Object



# File 'lib/transformers/models/mpnet/tokenization_mpnet_fast.rb', line 90

def create_token_type_ids_from_sequences(token_ids_0, token_ids_1: nil)
  sep = [@sep_token_id]
  cls = [@cls_token_id]

  if token_ids_1.nil?
    return [0] * (cls + token_ids_0 + sep).length
  end
  [0] * (cls + token_ids_0 + sep + sep + token_ids_1 + sep).length
end
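
MPNet does not use token type ids, so the returned mask is all zeros and its length matches the output of #build_inputs_with_special_tokens. A sketch with placeholder ids:

tokenizer.create_token_type_ids_from_sequences([101, 102])
# => [0, 0, 0, 0]            (cls + A + sep)

tokenizer.create_token_type_ids_from_sequences([101, 102], token_ids_1: [201])
# => [0, 0, 0, 0, 0, 0, 0]   (cls + A + sep + sep + B + sep)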

#mask_token ⇒ Object



# File 'lib/transformers/models/mpnet/tokenization_mpnet_fast.rb', line 64

def mask_token
  if @mask_token.nil?
    if @verbose
      Transformers.logger.error("Using mask_token, but it is not set yet.")
    end
    return nil
  end
  @mask_token.to_s
end

#mask_token=(value) ⇒ Object



# File 'lib/transformers/models/mpnet/tokenization_mpnet_fast.rb', line 74

def mask_token=(value)
  # The mask token behaves like a normal word, i.e. it includes the space
  # before it, so we set lstrip to true
  value = value.is_a?(String) ? Tokenizers::AddedToken.new(value, lstrip: true, rstrip: false) : value
  @mask_token = value
end
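
Assigning a String wraps it in a Tokenizers::AddedToken with lstrip: true, and the #mask_token reader converts it back to a String (a sketch):

tokenizer.mask_token = "<mask>"  # stored as an AddedToken with lstrip: true
tokenizer.mask_token             # => "<mask>"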

#save_vocabulary(save_directory, filename_prefix: nil) ⇒ Object



# File 'lib/transformers/models/mpnet/tokenization_mpnet_fast.rb', line 100

def save_vocabulary(save_directory, filename_prefix: nil)
  files = @tokenizer.model.save(save_directory, name: filename_prefix)
  Array(files)
end
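
A sketch of saving the vocabulary (the directory path is hypothetical, and the exact file list depends on the underlying Tokenizers model):

files = tokenizer.save_vocabulary("./mpnet-tokenizer")
# => e.g. ["./mpnet-tokenizer/vocab.txt"]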