Class: Parts::Tagger
- Inherits: Object
- Defined in: lib/parts.rb
Instance Attribute Summary
- #bigram_smoothing ⇒ Object
  Returns the value of attribute bigram_smoothing.
- #bigrams ⇒ Object
  Returns the value of attribute bigrams.
- #suffixes ⇒ Object
  Returns the value of attribute suffixes.
- #tags ⇒ Object
  Returns the value of attribute tags.
- #words ⇒ Object
  Returns the value of attribute words.
Instance Method Summary
- #classify(sentence) ⇒ Object
- #classify_unknown(word, tag) ⇒ Object
- #initialize(sentences = nil) ⇒ Tagger constructor
  A new instance of Tagger.
- #load(sentences) ⇒ Object
Constructor Details
#initialize(sentences = nil) ⇒ Tagger
Returns a new instance of Tagger.
# File 'lib/parts.rb', line 12

def initialize sentences=nil
  sentences = Treebank.new.sentences if sentences.nil?

  # Tag-bigrams are stored such that P(T2|T1) = @bigrams[T1][T2].
  # Word-tag pairs are stored such that P(W|T) = @words[W][T].
  # Tags are stored such that @tags[T] = no. of occurrences of T.
  @bigrams = Hash.new { |h, t| h[t] = Hash.new { |h, t| h[t] = 0 } }
  @words = Hash.new { |h, t| h[t] = Hash.new { |h, t| h[t] = 0 } }
  @tags = Hash.new { |h, t| h[t] = 0 }
  @bigram_smoothing = Hash.new { |h, t| h[t] = 0 }
  @suffixes = Hash.new { |h, t| h[t] = Hash.new { |h, t| h[t] = 0 } }

  self.load sentences
end
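As a usage sketch, the tagger can be trained on a hand-built corpus instead of the default Treebank sentences. The corpus below is purely illustrative, and it is an assumption that the library loads with require "parts":

    require "parts"

    # Each sentence is an ordered array of {:word => ..., :tag => ...} pairs,
    # the same shape #load expects.
    corpus = [
      [{:word => "Dogs", :tag => "NNS"}, {:word => "bark", :tag => "VBP"},
       {:word => ".", :tag => "."}],
      [{:word => "Cats", :tag => "NNS"}, {:word => "sleep", :tag => "VBP"},
       {:word => ".", :tag => "."}]
    ]

    tagger = Parts::Tagger.new(corpus)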
Instance Attribute Details
#bigram_smoothing ⇒ Object
Returns the value of attribute bigram_smoothing.
# File 'lib/parts.rb', line 10

def bigram_smoothing
  @bigram_smoothing
end
#bigrams ⇒ Object
Returns the value of attribute bigrams.
# File 'lib/parts.rb', line 10

def bigrams
  @bigrams
end
#suffixes ⇒ Object
Returns the value of attribute suffixes.
# File 'lib/parts.rb', line 10

def suffixes
  @suffixes
end
#tags ⇒ Object
Returns the value of attribute tags.
# File 'lib/parts.rb', line 10

def tags
  @tags
end
#words ⇒ Object
Returns the value of attribute words.
# File 'lib/parts.rb', line 10

def words
  @words
end
Instance Method Details
#classify(sentence) ⇒ Object
# File 'lib/parts.rb', line 73

def classify sentence
  # Sentences for classification are passed in as an array of words, e.g.
  # ["Hello", ",", "world"]. I have adapted the Viterbi algorithm to play to
  # the strengths of Ruby. That, or it's just an implementation of Viterbi
  # as I understand it.

  # The variable, paths, will store an array of the most successful paths up
  # to all of the possible word-tag pairs for our present word. For example,
  # if we are currently on the word 'world' from the above example, paths
  # will store the two highest scoring paths which result in the "NN" and
  # "NNP" variants of the word 'world'.

  # We initialise the first stage of our paths with the start tag, and set
  # the score to 1. We also add the end token to our sentence.
  paths = [{:words => [{:word => "$start", :tag => "$start"}], :score => 1}]
  sentence += ["$end"]

  # We iterate over each word in the sentence, initialising a new hash map
  # for each word, in which we will store the most successful path up to
  # each possible tag.
  sentence.each do |word|
    new_paths = Hash.new { |h, t| h[t] = {:score => -1} }

    # For each path leading up to the previous word's tags, we now calculate
    # a new score for how well they lead on to each of our current word's
    # tags.
    paths.each do |path|
      prev_tag = path[:words].last[:tag]
      tags = @words[word].keys
      tags = @bigrams[prev_tag].keys if tags.empty?
      tags = @tags.keys if tags.empty?

      # For each of our current word's potential tags we generate a new
      # score. If the score for this is larger than any other score we have
      # registered along other paths with this tag, we set it as the highest
      # achieving path for the tag we are currently looking at. In effect
      # this prunes our search space. When calculating word_score, in order
      # to account for unseen words, we fall back to the suffix-based
      # estimate from classify_unknown. For our bigram score, we introduce
      # the smoothing for each tag we look at. Bear in mind that due to our
      # initialisation of @bigrams, @bigrams[T1][T2] for a tag T1 or T2
      # which has not appeared will always return 0, thus ensuring our
      # smoothing will always work, even for tags we have not registered a
      # bigram probability for.
      tags.each do |tag|
        word_score = @words[word][tag] != 0 ? @words[word][tag] : classify_unknown(word, tag)
        bigram_score = @bigram_smoothing[prev_tag] + @bigrams[prev_tag][tag]
        score = path[:score] * word_score * bigram_score

        new_paths[tag] = {
          :words => (path[:words] + [{:word => word, :tag => tag}]),
          :score => score
        } if score > new_paths[tag][:score]
      end
    end

    # Here we update our best paths up until this word, for each of the
    # word's potential tags.
    paths = new_paths.values
  end

  # Having looped over every word, we have now covered the entire sentence,
  # and need simply pick the highest scoring path. We use [1..-2] to remove
  # the start and end word-tag pairs from our returned path.
  return paths.max_by { |a| a[:score] }[:words][1..-2]
end
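A hedged usage sketch: the exact tags returned depend entirely on the training corpus, so the output shown here is only indicative, not a guaranteed result:

    tagger = Parts::Tagger.new   # trains on the default Treebank sentences
    tagger.classify(["Hello", ",", "world"])
    # => e.g. [{:word => "Hello", :tag => "UH"},
    #          {:word => ",",     :tag => ","},
    #          {:word => "world", :tag => "NN"}]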
#classify_unknown(word, tag) ⇒ Object
# File 'lib/parts.rb', line 140

def classify_unknown word, tag
  # Estimate P(W|T) for an unseen word by interpolating the suffix-tag
  # probabilities of its last one to four characters, weighting the three-
  # and four-character suffixes most heavily.
  suffixes_weight = [0.05, 0.15, 0.5, 0.3]
  suffixes_probability = (1..4).map do |i|
    @suffixes[word[-i..-1]][tag]
  end
  suffixes_probability.zip(suffixes_weight).map { |p, w| p * w }.sum
end
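For intuition, a worked example of the suffix interpolation with made-up probabilities for the last one to four characters of an unseen word:

    probs   = [0.10, 0.20, 0.30, 0.25]  # hypothetical P(suffix|tag), 1-4 chars
    weights = [0.05, 0.15, 0.5, 0.3]    # the weights used by classify_unknown
    probs.zip(weights).map { |p, w| p * w }.sum
    # => 0.26 (up to floating-point rounding)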
#load(sentences) ⇒ Object
# File 'lib/parts.rb', line 25

def load sentences
  # Each sentence is passed in as an ordered array of hash maps, where each
  # hash map represents a word/tag pair, {:word => word, :tag => tag}.

  # We prepend a start tag and append an end tag to each sentence, then
  # iterate over each bigram in the sentence and increment the relevant
  # counters accordingly.
  sentences.each do |sentence|
    sentence = [{:word => "$start", :tag => "$start"}] + sentence
    sentence += [{:word => "$end", :tag => "$end"}]

    sentence.each_cons(2) do |previous, current|
      @words[current[:word]][current[:tag]] += 1
      @bigrams[previous[:tag]][current[:tag]] += 1
      @tags[current[:tag]] += 1

      (1..4).each do |i|
        @suffixes[current[:word][-i..-1]][current[:tag]] += 1
      end
    end
  end

  # For each tag-bigram, we convert its counter value into a probability. We
  # also take into account the effect add-one smoothing will have on each
  # tag.
  @bigrams.each do |tag, grams|
    total = grams.values.inject(:+)
    grams.each { |g, n| grams[g] = n.to_f / total }
    @bigram_smoothing[tag] = 1.0 / (@tags.length + total)
  end

  # For each word-tag pair, we convert its counter value into a probability.
  @words.each do |word, tags|
    # If a word occurs only once in the corpus we remove it.
    if tags.values.sum > 1
      tags.each { |t, n| tags[t] = n.to_f / @tags[t] }
    else
      @words.delete word
    end
  end

  # For each suffix-tag pair, we convert its counter value into a
  # probability.
  @suffixes.each do |suffix, tags|
    tags.each { |t, n| tags[t] = n.to_f / @tags[t] }
  end

  # We have now initialised our probability measures for tag-bigrams,
  # word-tag pairs and suffix-tag pairs, storing them in hash map data
  # structures for easy access.
end
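Continuing the hand-built corpus from the constructor sketch above, a hedged look at what #load leaves in the probability tables (values computed by hand for that toy corpus, and assuming the corpus variable is still in scope):

    tagger = Parts::Tagger.new(corpus)

    tagger.bigrams["NNS"]["VBP"]  # => 1.0  (every NNS was followed by a VBP)
    tagger.words["."]["."]        # => 1.0  ("." occurred twice, always tagged ".")
    tagger.tags["VBP"]            # => 2    (raw count, not a probability)
    tagger.words["bark"]          # => {}   (words seen only once are pruned)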