Module: TorchText::Data::Utils

Extended by:
Utils
Included in:
TorchText::Data, Utils
Defined in:
lib/torchtext/data/utils.rb

Instance Method Summary

Instance Method Details

#ngrams_iterator(token_list, ngrams) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/torchtext/data/utils.rb', line 17

# Yields each element of +token_list+ in order, then the n-grams of every
# size from 2 up to +ngrams+ (all bigrams first, then all trigrams, ...).
# Each n-gram is the window of consecutive tokens joined with a single space.
#
# @param token_list [Array] tokens to iterate; single tokens are yielded as-is
# @param ngrams [Integer] largest n-gram size to emit; below 2 only the
#   individual tokens are yielded
# @yield [gram] a single token, or an n-gram joined by " "
# @return [Enumerator] when no block is given
def ngrams_iterator(token_list, ngrams)
  return enum_for(:ngrams_iterator, token_list, ngrams) unless block_given?

  token_list.each { |token| yield token }

  2.upto(ngrams) do |n|
    # each_cons slides a window of n consecutive tokens and yields nothing
    # when the list holds fewer than n tokens, so no bounds math is needed.
    token_list.each_cons(n) { |window| yield window.join(" ") }
  end
end

#tokenizer(tokenizer, language: "en") ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
# File 'lib/torchtext/data/utils.rb', line 4

# Resolves a tokenizer selector to a callable Method object.
#
# @param tokenizer [String, nil] +nil+ selects +split_tokenizer+;
#   "basic_english" selects +basic_english_normalize+
# @param language [String] language code; only "en" is accepted together
#   with "basic_english"
# @return [Method] bound method object for the selected tokenizer
# @raise [ArgumentError] if "basic_english" is requested for a language other than "en"
# @raise [RuntimeError] for any other tokenizer value
def tokenizer(tokenizer, language: "en")
  if tokenizer.nil?
    method(:split_tokenizer)
  elsif tokenizer == "basic_english"
    raise ArgumentError, "Basic normalization is only available for English(en)" unless language == "en"

    method(:basic_english_normalize)
  else
    raise "Not implemented yet"
  end
end