Class: Transformers::Distilbert::DistilBertTokenizer

Inherits:
PreTrainedTokenizer
Defined in:
lib/transformers/models/distilbert/tokenization_distilbert.rb

Defined Under Namespace

Classes: BasicTokenizer, WordpieceTokenizer

Constant Summary

VOCAB_FILES_NAMES =
{vocab_file: "vocab.txt"}

Constants included from SpecialTokensMixin

SpecialTokensMixin::SPECIAL_TOKENS_ATTRIBUTES

Instance Attribute Summary

Attributes inherited from PreTrainedTokenizerBase

#init_kwargs, #model_max_length

Instance Method Summary

Methods inherited from PreTrainedTokenizer

#_convert_token_to_id_with_added_voc, #_encode_plus, #convert_tokens_to_ids, #is_fast, #tokenize, #vocab_size

Methods inherited from PreTrainedTokenizerBase

#_eventual_warn_about_too_long_sequence, _from_pretrained, #call, from_pretrained

Methods included from ClassAttribute

#class_attribute

Methods included from SpecialTokensMixin

#bos_token_id, #cls_token_id, #eos_token_id, #pad_token_id, #sep_token_id, #special_tokens_map, #unk_token_id
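
A typical entry point is the inherited from_pretrained class method together with #call. A minimal usage sketch, assuming the "distilbert-base-uncased" checkpoint and treating the returned encoding as a hash-like structure (the require name, checkpoint, and exact return shape are illustrative assumptions, not taken from this page):

require "transformers-rb"  # gem require name assumed

tokenizer = Transformers::Distilbert::DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
encoding = tokenizer.("Hello world")  # #call builds the encoded inputs
encoding[:input_ids]                  # => token ids, e.g. [101, 7592, 2088, 102]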

Constructor Details

#initialize(vocab_file:, do_lower_case: true, do_basic_tokenize: true, never_split: nil, unk_token: "[UNK]", sep_token: "[SEP]", pad_token: "[PAD]", cls_token: "[CLS]", mask_token: "[MASK]", tokenize_chinese_chars: true, strip_accents: nil, **kwargs) ⇒ DistilBertTokenizer

Returns a new instance of DistilBertTokenizer.



# File 'lib/transformers/models/distilbert/tokenization_distilbert.rb', line 54

def initialize(
  vocab_file:,
  do_lower_case: true,
  do_basic_tokenize: true,
  never_split: nil,
  unk_token: "[UNK]",
  sep_token: "[SEP]",
  pad_token: "[PAD]",
  cls_token: "[CLS]",
  mask_token: "[MASK]",
  tokenize_chinese_chars: true,
  strip_accents: nil,
  **kwargs
)
  @vocab = load_vocab(vocab_file)
  @ids_to_tokens = @vocab.invert
  @do_basic_tokenize = do_basic_tokenize
  if do_basic_tokenize
    @basic_tokenizer =
      BasicTokenizer.new(
        do_lower_case: do_lower_case,
        never_split: never_split,
        tokenize_chinese_chars: tokenize_chinese_chars,
        strip_accents: strip_accents
      )
  end
  @wordpiece_tokenizer = WordpieceTokenizer.new(vocab: @vocab, unk_token: unk_token.to_s)

  super(
    do_lower_case: do_lower_case,
    do_basic_tokenize: do_basic_tokenize,
    never_split: never_split,
    unk_token: unk_token,
    sep_token: sep_token,
    pad_token: pad_token,
    cls_token: cls_token,
    mask_token: mask_token,
    tokenize_chinese_chars: tokenize_chinese_chars,
    strip_accents: strip_accents,
    **kwargs
  )
end
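
The constructor can also be driven directly from a local WordPiece vocabulary file. A minimal sketch (the "vocab.txt" path is a hypothetical placeholder for a one-token-per-line vocabulary file):

tokenizer = Transformers::Distilbert::DistilBertTokenizer.new(
  vocab_file: "vocab.txt",  # hypothetical local path
  do_lower_case: true
)
tokenizer.tokenize("Hello world!")  # => WordPiece tokens, e.g. ["hello", "world", "!"]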

Instance Attribute Details

#basic_tokenizer ⇒ Object (readonly)

Returns the value of attribute basic_tokenizer.



# File 'lib/transformers/models/distilbert/tokenization_distilbert.rb', line 52

def basic_tokenizer
  @basic_tokenizer
end

#vocab ⇒ Object (readonly)

Returns the value of attribute vocab.



# File 'lib/transformers/models/distilbert/tokenization_distilbert.rb', line 52

def vocab
  @vocab
end
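
Both readers are handy for inspecting a built tokenizer. A small sketch, assuming a tokenizer constructed as above (the vocabulary size shown is only an example):

tokenizer.vocab.size       # => number of WordPiece entries, e.g. 30522
tokenizer.vocab["[CLS]"]   # => integer id of the [CLS] token
tokenizer.basic_tokenizer  # => BasicTokenizer instance, or nil when do_basic_tokenize: false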

Instance Method Details

#_convert_token_to_id(token) ⇒ Object



# File 'lib/transformers/models/distilbert/tokenization_distilbert.rb', line 97

def _convert_token_to_id(token)
  @vocab.fetch(token, @vocab.fetch(@unk_token))
end
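
The lookup falls back to the id of the configured unk_token whenever a token is missing from the vocabulary. A sketch, assuming the tokenizer above (the ids shown are illustrative):

tokenizer._convert_token_to_id("hello")      # => id stored in vocab.txt, e.g. 7592
tokenizer._convert_token_to_id("zzqq##xx")   # => id of "[UNK]", e.g. 100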