Class: Transformers::Bert::BertTokenizer

Inherits:
PreTrainedTokenizer show all
Defined in:
lib/transformers/models/bert/tokenization_bert.rb

Defined Under Namespace

Classes: BasicTokenizer, WordpieceTokenizer

Constant Summary

Constants included from SpecialTokensMixin

SpecialTokensMixin::SPECIAL_TOKENS_ATTRIBUTES

Instance Attribute Summary collapse

Attributes inherited from PreTrainedTokenizerBase

#init_kwargs, #model_max_length

Instance Method Summary collapse

Methods inherited from PreTrainedTokenizer

#_convert_token_to_id_with_added_voc, #_encode_plus, #convert_tokens_to_ids, #is_fast, #tokenize, #vocab_size

Methods inherited from PreTrainedTokenizerBase

#_eventual_warn_about_too_long_sequence, _from_pretrained, #call, from_pretrained

Methods included from ClassAttribute

#class_attribute

Methods included from SpecialTokensMixin

#bos_token_id, #cls_token_id, #eos_token_id, #pad_token_id, #sep_token_id, #special_tokens_map, #unk_token_id

Constructor Details

#initialize(vocab_file:, do_lower_case: true, do_basic_tokenize: true, never_split: nil, unk_token: "[UNK]", sep_token: "[SEP]", pad_token: "[PAD]", cls_token: "[CLS]", mask_token: "[MASK]", tokenize_chinese_chars: true, strip_accents: nil, **kwargs) ⇒ BertTokenizer



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/transformers/models/bert/tokenization_bert.rb', line 49

# Builds a BERT tokenizer from a local vocabulary file.
#
# Loads the vocab into a token=>id hash (and the inverse id=>token map),
# optionally sets up a BasicTokenizer for pre-splitting (lowercasing,
# Chinese-char handling, accent stripping), always sets up the
# WordpieceTokenizer, and finally forwards every option to the superclass
# initializer so special tokens are registered there.
#
# Raises ArgumentError when +vocab_file+ does not exist on disk.
def initialize(
  vocab_file:,
  do_lower_case: true,
  do_basic_tokenize: true,
  never_split: nil,
  unk_token: "[UNK]",
  sep_token: "[SEP]",
  pad_token: "[PAD]",
  cls_token: "[CLS]",
  mask_token: "[MASK]",
  tokenize_chinese_chars: true,
  strip_accents: nil,
  **kwargs
)
  unless File.exist?(vocab_file)
    message = "Can't find a vocabulary file at path '#{vocab_file}'. To load the vocabulary from a Google pretrained" \
      " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
    raise ArgumentError, message
  end

  @vocab = load_vocab(vocab_file)
  @ids_to_tokens = @vocab.invert
  @do_basic_tokenize = do_basic_tokenize

  # The basic tokenizer is only constructed when requested; otherwise
  # @basic_tokenizer stays unset and wordpiece runs on raw input.
  if do_basic_tokenize
    @basic_tokenizer = BasicTokenizer.new(
      do_lower_case: do_lower_case,
      never_split: never_split,
      tokenize_chinese_chars: tokenize_chinese_chars,
      strip_accents: strip_accents
    )
  end

  @wordpiece_tokenizer = WordpieceTokenizer.new(vocab: @vocab, unk_token: unk_token.to_s)

  # Hand every option (plus any extra kwargs) up to PreTrainedTokenizer,
  # which records special tokens and shared configuration.
  super(
    do_lower_case: do_lower_case,
    do_basic_tokenize: do_basic_tokenize,
    never_split: never_split,
    unk_token: unk_token,
    sep_token: sep_token,
    pad_token: pad_token,
    cls_token: cls_token,
    mask_token: mask_token,
    tokenize_chinese_chars: tokenize_chinese_chars,
    strip_accents: strip_accents,
    **kwargs
  )
end

Instance Attribute Details

#basic_tokenizerObject (readonly)

Returns the value of attribute basic_tokenizer.



47
48
49
# File 'lib/transformers/models/bert/tokenization_bert.rb', line 47

# Read-only accessor for the BasicTokenizer instance (nil when
# do_basic_tokenize was false at construction time).
def basic_tokenizer = @basic_tokenizer

#vocabObject (readonly)

Returns the value of attribute vocab.



47
48
49
# File 'lib/transformers/models/bert/tokenization_bert.rb', line 47

# Read-only accessor for the token => id vocabulary hash.
def vocab = @vocab

Instance Method Details

#_convert_token_to_id(token) ⇒ Object



98
99
100
# File 'lib/transformers/models/bert/tokenization_bert.rb', line 98

# Maps a token string to its vocabulary id, falling back to the id of
# the unknown token (@unk_token) when the token is not in the vocab.
#
# The fallback lookup runs lazily via the block form of Hash#fetch: the
# original two-argument form evaluated `@vocab.fetch(@unk_token)` on
# every call, doing a wasted lookup for known tokens and raising
# KeyError for a missing unk entry even when +token+ itself was present.
#
# Raises KeyError only when both +token+ and @unk_token are absent.
def _convert_token_to_id(token)
  @vocab.fetch(token) { @vocab.fetch(@unk_token) }
end