Class: TorchText::Vocab

Inherits:
Object
  • Object
show all
Defined in:
lib/torchtext/vocab.rb

Constant Summary collapse

UNK =
"<unk>"

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(counter, max_size: nil, min_freq: 1, specials: ["<unk>", "<pad>"], vectors: nil, unk_init: nil, vectors_cache: nil, specials_first: true) ⇒ Vocab

Returns a new instance of Vocab.



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/torchtext/vocab.rb', line 5

# Builds a vocabulary from a token => frequency counter.
#
# counter        - Hash mapping each token to its frequency.
# max_size       - maximum vocabulary size, or nil for unlimited; special
#                  tokens do not count against it when specials_first is true.
# min_freq       - minimum frequency required to keep a token (clamped to >= 1).
# specials       - tokens always included regardless of frequency.
# vectors        - pretrained vectors (not implemented yet; must be nil).
# unk_init       - initializer for unknown-token vectors (must be nil for now).
# vectors_cache  - cache directory for vectors (must be nil for now).
# specials_first - place specials at the start (true) or end (false) of the index.
def initialize(
  counter, max_size: nil, min_freq: 1, specials: ["<unk>", "<pad>"],
  vectors: nil, unk_init: nil, vectors_cache: nil, specials_first: true
)

  @freqs = counter
  counter = counter.dup
  min_freq = [min_freq, 1].max

  @itos = []
  @unk_index = nil

  if specials_first
    # dup so that appending vocabulary words below cannot mutate the
    # caller's specials array
    @itos = specials.dup
    # only extend max size if specials are prepended
    max_size += specials.size if max_size
  end

  # frequencies of special tokens are not counted when building vocabulary
  # in frequency order
  specials.each do |tok|
    counter.delete(tok)
  end

  # sort by frequency, then alphabetically
  words_and_frequencies = counter.sort_by { |k, v| [-v, k] }

  words_and_frequencies.each do |word, freq|
    break if freq < min_freq || @itos.length == max_size
    @itos << word
  end

  if specials.include?(UNK)  # hard-coded for now
    unk_index = specials.index(UNK)  # position in list
    # account for ordering of specials, set variable
    @unk_index = specials_first ? unk_index : @itos.length + unk_index
    @stoi = Hash.new(@unk_index)
  else
    @stoi = {}
  end

  if !specials_first
    @itos.concat(specials)
  end

  # stoi is simply a reverse dict for itos
  @itos.each_with_index do |tok, i|
    @stoi[tok] = i
  end

  @vectors = nil
  if !vectors.nil?
    # self.load_vectors(vectors, unk_init=unk_init, cache=vectors_cache)
    raise "Not implemented yet"
  else
    raise "Failed assertion" unless unk_init.nil?
    raise "Failed assertion" unless vectors_cache.nil?
  end
end

Class Method Details

.build_vocab_from_iterator(iterator) ⇒ Object



74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/torchtext/vocab.rb', line 74

# Counts token frequencies across every token list yielded by +iterator+
# and builds a Vocab from the resulting counter. Logs progress every
# 10,000 iterations.
def self.build_vocab_from_iterator(iterator)
  frequencies = Hash.new(0)
  iterator.each_with_index do |tokens, idx|
    tokens.each { |token| frequencies[token] += 1 }
    processed = idx + 1
    puts "Processed #{processed}" if processed % 10000 == 0
  end
  Vocab.new(frequencies)
end

Instance Method Details

#[](token) ⇒ Object



65
66
67
# File 'lib/torchtext/vocab.rb', line 65

# Returns the index for +token+; unknown tokens resolve to the index of
# the UNK special via the fallback lookup.
def [](token)
  unknown_index = @stoi.fetch(UNK)
  @stoi.fetch(token, unknown_index)
end

#length ⇒ Object Also known as: size



69
70
71
# File 'lib/torchtext/vocab.rb', line 69

# Number of entries in the vocabulary (special tokens included).
def length
  @itos.size
end