Class: BxBuilderChain::Utils::Tokenization::BytePairEncoding

Inherits:
Object
  • Object
show all
Defined in:
lib/bx_builder_chain/utils/tokenization/byte_pair_encoding.rb

Instance Method Summary collapse

Constructor Details

#initialize(pat_str:, mergeable_ranks:) ⇒ BytePairEncoding

Returns a new instance of BytePairEncoding.



5
6
7
8
9
10
11
# File 'lib/bx_builder_chain/utils/tokenization/byte_pair_encoding.rb', line 5

# Builds an encoder from a split pattern and a rank table.
#
# @param pat_str [String] pattern source handed to Regexp.new; the compiled
#   result is what #encode scans with
# @param mergeable_ranks [Hash] rank table; stored as given and also inverted
#   to form the decoder lookup (presumably token bytes => rank — confirm
#   against callers)
def initialize(pat_str:, mergeable_ranks:)
  # Keep the raw inputs alongside the derived lookups.
  @pat_str, @mergeable_ranks = pat_str, mergeable_ranks
  # Reverse mapping (value => key) of the rank table.
  @decoder = @mergeable_ranks.invert
  @pat = Regexp.new(@pat_str)
  # Starts empty; read by #visualise_tokenised_string.
  @tokenized_string = []
end

Instance Method Details

#encode(text, visualise: nil) ⇒ Object



13
14
15
16
# File 'lib/bx_builder_chain/utils/tokenization/byte_pair_encoding.rb', line 13

# Encodes text by splitting it with the compiled pattern and running each
# piece's bytes through bpe_encode, concatenating the per-piece results.
#
# NOTE(review): each scan result must respond to #bytes, so this assumes the
# pattern contains no capture groups (String#scan yields arrays otherwise).
#
# @param text [String] the text to encode
# @param visualise [Object, nil] forwarded unchanged to bpe_encode
# @return [Array] per-piece bpe_encode results, flattened by one level
def encode(text, visualise: nil)
  text.scan(@pat).flat_map do |piece|
    bpe_encode(piece.bytes, visualise: visualise)
  end
end

#visualise_tokenised_string ⇒ Object



18
19
20
# File 'lib/bx_builder_chain/utils/tokenization/byte_pair_encoding.rb', line 18

# Passes the stored token list to visualise_tokens_coloured.
#
# @return [Object, nil] whatever visualise_tokens_coloured returns, or nil
#   when no tokens have been stored
def visualise_tokenised_string
  # Nothing to show when the token list is empty.
  return if @tokenized_string.empty?

  visualise_tokens_coloured(@tokenized_string)
end