Module: TorchText::Data::Utils
Instance Method Summary
Instance Method Details
#ngrams_iterator(token_list, ngrams) ⇒ Object
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
# File 'lib/torchtext/data/utils.rb', line 17
#
# Yields each element of +token_list+ unchanged, then every n-gram of
# consecutive elements for n = 2 up to +ngrams+, each n-gram joined into a
# single space-separated string. All n-grams of one size are yielded (in
# left-to-right order) before moving on to the next size.
#
# @param token_list [Enumerable] the tokens, in order (typically an Array of Strings)
# @param ngrams [Integer] the largest n-gram size to emit; with a value below 2
#   only the individual tokens are yielded
# @yield [gram] each token, then each joined n-gram
# @return [Enumerator] when no block is given
def ngrams_iterator(token_list, ngrams)
  return enum_for(:ngrams_iterator, token_list, ngrams) unless block_given?

  token_list.each { |token| yield token }

  2.upto(ngrams) do |n|
    # each_cons yields nothing when n exceeds the number of tokens, so
    # oversized n-gram lengths are skipped without any explicit bounds check.
    token_list.each_cons(n) { |gram| yield gram.join(" ") }
  end
end
#tokenizer(tokenizer, language: "en") ⇒ Object
4 5 6 7 8 9 10 11 12 13 14 15 |
# File 'lib/torchtext/data/utils.rb', line 4
#
# Resolves a tokenizer specification into a callable object.
#
# * +nil+ resolves to the +split_tokenizer+ method (defined elsewhere in this module).
# * <tt>"basic_english"</tt> resolves to the +basic_english_normalize+ method
#   (defined elsewhere in this module) and requires +language+ to be <tt>"en"</tt>.
# * Any object that responds to +call+ (lambda, proc, Method, ...) is returned unchanged.
#
# @param tokenizer [nil, String, #call] the tokenizer name, or a ready-made callable
# @param language [String] language code; only checked for "basic_english"
# @return [#call] the resolved tokenizer
# @raise [ArgumentError] if "basic_english" is requested for a language other than "en"
# @raise [RuntimeError] if the tokenizer is neither a supported name nor callable
def tokenizer(tokenizer, language: "en")
  return method(:split_tokenizer) if tokenizer.nil?

  if tokenizer == "basic_english"
    unless language == "en"
      raise ArgumentError, "Basic normalization is only available for English(en)"
    end

    return method(:basic_english_normalize)
  end

  # A caller-supplied callable is already a tokenizer; hand it back as-is.
  return tokenizer if tokenizer.respond_to?(:call)

  raise "Not implemented yet"
end