Class: BxBuilderChain::Utils::Tokenization::OpenAiEncodings
- Inherits: Object
- Object
- BxBuilderChain::Utils::Tokenization::OpenAiEncodings
- Defined in:
- lib/bx_builder_chain/utils/tokenization/open_ai_encodings.rb
Constant Summary
- ENDOFTEXT =
"<|endoftext|>"
- FIM_PREFIX =
"<|fim_prefix|>"
- FIM_MIDDLE =
"<|fim_middle|>"
- FIM_SUFFIX =
"<|fim_suffix|>"
- ENDOFPROMPT =
"<|endofprompt|>"
Class Method Summary
Class Method Details
.cl100k_base ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/bx_builder_chain/utils/tokenization/open_ai_encodings.rb', line 23

# Builds the definition hash for the "cl100k_base" encoding.
#
# The mergeable ranks are read from the bundled
# ../token_data/cl100k_base.tiktoken file (resolved relative to this file)
# on every call.
#
# @return [Hash] with string keys "name", "pat_str", "mergeable_ranks" and
#   "special_tokens"
def self.cl100k_base
  vocabulary_path = File.join(__dir__, '..', 'token_data', 'cl100k_base.tiktoken')
  # Load first so a missing or unreadable vocabulary file fails before
  # anything else is built.
  ranks = load_tiktoken_bpe(vocabulary_path)

  {
    "name" => "cl100k_base",
    "pat_str" => /(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/,
    "mergeable_ranks" => ranks,
    "special_tokens" => {
      ENDOFTEXT => 100_257,
      FIM_PREFIX => 100_258,
      FIM_MIDDLE => 100_259,
      FIM_SUFFIX => 100_260,
      ENDOFPROMPT => 100_276
    }
  }
end
.load_tiktoken_bpe(tiktoken_bpe_file) ⇒ Object
15 16 17 18 19 20 21 |
# File 'lib/bx_builder_chain/utils/tokenization/open_ai_encodings.rb', line 15

# Loads a tiktoken BPE vocabulary file into a token => rank hash.
#
# Each non-blank line is split on whitespace into a Base64-encoded token
# followed by its rank. Blank and whitespace-only lines are ignored.
#
# @param tiktoken_bpe_file [String] path to the .tiktoken file
# @return [Hash{String => Integer}] decoded token bytes mapped to their rank
# @raise [SystemCallError] if the file cannot be opened (e.g. Errno::ENOENT)
def self.load_tiktoken_bpe(tiktoken_bpe_file)
  # File.foreach streams the file line by line rather than reading it all
  # into one string and splitting it afterwards.
  File.foreach(tiktoken_bpe_file).each_with_object({}) do |line, ranks|
    token, rank = line.split
    # A blank or whitespace-only line splits to [], leaving token nil;
    # skip it instead of failing while decoding nil.
    next if token.nil?

    # unpack1("m") is the same lenient Base64 decode that Base64.decode64
    # performs, without needing the base64 library to be loaded.
    ranks[token.unpack1("m")] = rank.to_i
  end
end