Class: BxBuilderChain::Utils::Tokenization::OpenAiEncodings

Inherits:
Object
  • Object
show all
Defined in:
lib/bx_builder_chain/utils/tokenization/open_ai_encodings.rb

Constant Summary collapse

# Literal marker strings for the special tokens. cl100k_base assigns each one
# a fixed token id in its "special_tokens" table.

# Mapped to id 100257 by cl100k_base.
ENDOFTEXT =
"<|endoftext|>"
# Mapped to id 100258 by cl100k_base.
FIM_PREFIX =
"<|fim_prefix|>"
# Mapped to id 100259 by cl100k_base.
FIM_MIDDLE =
"<|fim_middle|>"
# Mapped to id 100260 by cl100k_base.
FIM_SUFFIX =
"<|fim_suffix|>"
# Mapped to id 100276 by cl100k_base.
ENDOFPROMPT =
"<|endofprompt|>"

Class Method Summary collapse

Class Method Details

.cl100k_base ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/bx_builder_chain/utils/tokenization/open_ai_encodings.rb', line 23

# Builds the definition Hash for the "cl100k_base" encoding.
#
# The BPE rank table is read from token_data/cl100k_base.tiktoken, located
# one directory above this file.
#
# @return [Hash] with String keys:
#   "name"            - the encoding name ("cl100k_base")
#   "pat_str"         - Regexp used to pre-split text into pieces
#   "mergeable_ranks" - Hash returned by load_tiktoken_bpe for the rank file
#   "special_tokens"  - Hash of special-token marker String => Integer id
def self.cl100k_base
  bpe_path = File.join(__dir__, '..', 'token_data', 'cl100k_base.tiktoken')

  # NOTE(review): in Ruby regexps \s is ASCII-only by default; confirm that
  # matches the whitespace handling expected of this pattern.
  {
    "name" => "cl100k_base",
    "pat_str" => /(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/,
    "mergeable_ranks" => load_tiktoken_bpe(bpe_path),
    "special_tokens" => {
      ENDOFTEXT => 100257,
      FIM_PREFIX => 100258,
      FIM_MIDDLE => 100259,
      FIM_SUFFIX => 100260,
      ENDOFPROMPT => 100276
    }
  }
end

.load_tiktoken_bpe(tiktoken_bpe_file) ⇒ Object



15
16
17
18
19
20
21
# File 'lib/bx_builder_chain/utils/tokenization/open_ai_encodings.rb', line 15

# Loads a tiktoken BPE rank file into a Hash of decoded token => rank.
#
# Every non-blank line is split on whitespace into a Base64-encoded token
# followed by its rank. Blank and whitespace-only lines are ignored.
#
# @param tiktoken_bpe_file [String] path of the rank file to read
# @return [Hash{String => Integer}] Base64-decoded token mapped to its rank
def self.load_tiktoken_bpe(tiktoken_bpe_file)
  # File.foreach streams line by line rather than holding the whole file
  # (plus a split copy of it) in memory.
  File.foreach(tiktoken_bpe_file).each_with_object({}) do |line, ranks|
    token, rank = line.split
    # A blank line splits to [], leaving token nil; decoding nil would raise.
    next if token.nil?

    ranks[Base64.decode64(token)] = rank.to_i
  end
end