Class: Transformers::DebertaV2::DebertaV2TokenizerFast

Inherits:
PreTrainedTokenizerFast
Defined in:
lib/transformers/models/deberta_v2/tokenization_deberta_v2_fast.rb

Constant Summary

VOCAB_FILES_NAMES =
{vocab_file: "spm.model", tokenizer_file: "tokenizer.json"}

Constants included from SpecialTokensMixin

SpecialTokensMixin::SPECIAL_TOKENS_ATTRIBUTES

Instance Attribute Summary

Attributes inherited from PreTrainedTokenizerBase

#init_kwargs, #model_max_length

Instance Method Summary

Methods inherited from PreTrainedTokenizerFast

#_convert_token_to_id_with_added_voc, #backend_tokenizer, #convert_ids_to_tokens, #convert_tokens_to_ids, #convert_tokens_to_string, #get_vocab, #is_fast, #vocab

Methods inherited from PreTrainedTokenizerBase

#_eventual_warn_about_too_long_sequence, _from_pretrained, #call, from_pretrained

Methods included from ClassAttribute

#class_attribute

Methods included from SpecialTokensMixin

#bos_token_id, #cls_token_id, #eos_token_id, #pad_token_id, #sep_token_id, #special_tokens_map, #unk_token_id

Constructor Details

#initialize(vocab_file: nil, tokenizer_file: nil, do_lower_case: false, split_by_punct: false, bos_token: "[CLS]", eos_token: "[SEP]", unk_token: "[UNK]", sep_token: "[SEP]", pad_token: "[PAD]", cls_token: "[CLS]", mask_token: "[MASK]", **kwargs) ⇒ DebertaV2TokenizerFast

self.slow_tokenizer_class = DebertaV2Tokenizer



# File 'lib/transformers/models/deberta_v2/tokenization_deberta_v2_fast.rb', line 23

def initialize(
  vocab_file: nil,
  tokenizer_file: nil,
  do_lower_case: false,
  split_by_punct: false,
  bos_token: "[CLS]",
  eos_token: "[SEP]",
  unk_token: "[UNK]",
  sep_token: "[SEP]",
  pad_token: "[PAD]",
  cls_token: "[CLS]",
  mask_token: "[MASK]",
  **kwargs
)
  super(
    vocab_file,
    tokenizer_file: tokenizer_file,
    do_lower_case: do_lower_case,
    bos_token: bos_token,
    eos_token: eos_token,
    unk_token: unk_token,
    sep_token: sep_token,
    pad_token: pad_token,
    cls_token: cls_token,
    mask_token: mask_token,
    split_by_punct: split_by_punct,
    **kwargs
  )

  @do_lower_case = do_lower_case
  @split_by_punct = split_by_punct
  @vocab_file = vocab_file
end
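
In practice the tokenizer is usually loaded from a pretrained checkpoint rather than constructed directly. A minimal sketch, assuming the gem is installed and the checkpoint is reachable; the checkpoint name is illustrative, not prescribed by this class:

require "transformers"

# Checkpoint name is an assumption for illustration; any DeBERTa-v2/v3
# checkpoint that ships a tokenizer.json should work.
tokenizer = Transformers::DebertaV2::DebertaV2TokenizerFast.from_pretrained("microsoft/deberta-v3-base")

# #call is inherited from PreTrainedTokenizerBase
encoding = tokenizer.call("DeBERTa improves BERT with disentangled attention.")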

Instance Method Details

#build_inputs_with_special_tokens(token_ids_0, token_ids_1: nil) ⇒ Object



# File 'lib/transformers/models/deberta_v2/tokenization_deberta_v2_fast.rb', line 48

def build_inputs_with_special_tokens(token_ids_0, token_ids_1: nil)
  if token_ids_1.nil?
    return [@cls_token_id] + token_ids_0 + [@sep_token_id]
  end
  cls = [@cls_token_id]
  sep = [@sep_token_id]
  cls + token_ids_0 + sep + token_ids_1 + sep
end
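
The method wraps raw token ids in DeBERTa's input format: [CLS] A [SEP] for a single sequence and [CLS] A [SEP] B [SEP] for a pair. A standalone sketch of the same concatenation, assuming [CLS] = 1 and [SEP] = 2 (typical for DeBERTa-v3 vocabularies; verify against your checkpoint):

cls_id = 1          # assumed [CLS] id
sep_id = 2          # assumed [SEP] id
ids_a = [10, 11, 12]
ids_b = [20, 21]

# Single sequence: [CLS] A [SEP]
[cls_id] + ids_a + [sep_id]
# => [1, 10, 11, 12, 2]

# Sequence pair: [CLS] A [SEP] B [SEP]
[cls_id] + ids_a + [sep_id] + ids_b + [sep_id]
# => [1, 10, 11, 12, 2, 20, 21, 2]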

#can_save_slow_tokenizer ⇒ Object



# File 'lib/transformers/models/deberta_v2/tokenization_deberta_v2_fast.rb', line 44

def can_save_slow_tokenizer
  @vocab_file ? File.exist?(@vocab_file) : false
end
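
The predicate is true only while the original SentencePiece file (spm.model) is still present on disk; a tokenizer loaded purely from tokenizer.json cannot be converted back to the slow DebertaV2Tokenizer. A brief usage sketch, reusing the tokenizer from the constructor example:

# false when vocab_file was nil or the file has since been removed
tokenizer.can_save_slow_tokenizer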

#create_token_type_ids_from_sequences(token_ids_0, token_ids_1: nil) ⇒ Object



# File 'lib/transformers/models/deberta_v2/tokenization_deberta_v2_fast.rb', line 68

def create_token_type_ids_from_sequences(token_ids_0, token_ids_1: nil)
  sep = [@sep_token_id]
  cls = [@cls_token_id]
  if token_ids_1.nil?
    return [0] * (cls + token_ids_0 + sep).length
  end
  ([0] * (cls + token_ids_0 + sep).length) + ([1] * (token_ids_1 + sep).length)
end
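
Token type ids mark segment membership: 0 for [CLS] A [SEP] and 1 for B [SEP]. A standalone sketch of the arithmetic, with the same assumed special-token ids as above:

cls = [1]           # assumed [CLS] id
sep = [2]           # assumed [SEP] id
ids_a = [10, 11, 12]
ids_b = [20, 21]

# 0 for the first segment (including [CLS] and its [SEP]), 1 for the second
([0] * (cls + ids_a + sep).length) + ([1] * (ids_b + sep).length)
# => [0, 0, 0, 0, 0, 1, 1, 1]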

#get_special_tokens_mask(token_ids_0, token_ids_1: nil, already_has_special_tokens: false) ⇒ Object



# File 'lib/transformers/models/deberta_v2/tokenization_deberta_v2_fast.rb', line 57

def get_special_tokens_mask(token_ids_0, token_ids_1: nil, already_has_special_tokens: false)
  if already_has_special_tokens
    return super(token_ids_0: token_ids_0, token_ids_1: token_ids_1, already_has_special_tokens: true)
  end

  if !token_ids_1.nil?
    return [1] + ([0] * token_ids_0.length) + [1] + ([0] * token_ids_1.length) + [1]
  end
  [1] + ([0] * token_ids_0.length) + [1]
end