Module: BLEU
- Defined in:
- lib/zipf/bleu.rb
Defined Under Namespace
Classes: NgramCounts, Ngrams
Class Method Summary
- .best_match_length(hypothesis, references) ⇒ Object
- .bleu(hyp_file, ref_file, n, debug = false) ⇒ Object
- .bleu_(counts, n, debug = false) ⇒ Object
- .brevity_penalty(c, r, smooth = 0.0) ⇒ Object
- .get_counts(hypothesis, references, n, times = 1) ⇒ Object
- .hbleu(hypotheses, references, n, debug = false) ⇒ Object
- .hbleu_(counts, n, debug = false) ⇒ Object
- .per_sentence_bleu(hypothesis, references, n = 4, smooth = 0.0) ⇒ Object
Class Method Details
.best_match_length(hypothesis, references) ⇒ Object
65 66 67 68 69 70 71 72 73 74 |
# File 'lib/zipf/bleu.rb', line 65
#
# Returns the hypothesis length together with the length of the reference
# that is closest to it. Lengths are counted in whitespace-separated tokens
# after stripping leading/trailing whitespace.
#
# When two references are equally close, the shorter one is chosen.
#
# The previous implementation never updated its running minimum, so it
# always reported the length of the *last* reference; it also relied on
# Integer::MAX, which core Ruby does not define.
#
# @param hypothesis [String] the candidate sentence
# @param references [Array<String>] the reference sentences
# @return [Array(Integer, Integer)] +[hyp_len, best_ref_len]+;
#   +best_ref_len+ is nil when +references+ is empty
def BLEU::best_match_length(hypothesis, references)
  hyp_len = hypothesis.strip.split.size
  ref_lens = references.map { |r| r.strip.split.size }
  # Sort key: absolute length difference first, then the length itself so
  # that ties resolve towards the shorter reference.
  best_ref_len = ref_lens.min_by { |len| [(hyp_len - len).abs, len] }
  return hyp_len, best_ref_len
end
.bleu(hyp_file, ref_file, n, debug = false) ⇒ Object
115 116 117 118 119 120 121 122 123 124 125 |
# File 'lib/zipf/bleu.rb', line 115
#
# Corpus-level BLEU for a hypothesis file scored against a reference file.
#
# Both files are read through ReadFile.readlines_strip; line i of the
# reference file is paired with hypothesis line i. Each reference line is
# passed through splitpipe(l, 3) to obtain the list of references for that
# segment (helper not visible here — presumably it splits on a pipe
# delimiter; confirm against its definition).
#
# The n-gram order used for collecting statistics now follows +n+. It was
# previously hard-coded to 4, independent of the order requested by the
# caller.
#
# @param hyp_file [String] path of the hypothesis file
# @param ref_file [String] path of the reference file
# @param n [Integer] maximum n-gram order
# @param debug [Boolean] forwarded to bleu_, which writes diagnostics to STDERR
# @return [Float] whatever bleu_ returns for the collected statistics
def BLEU::bleu(hyp_file, ref_file, n, debug = false)
  hypotheses = ReadFile.readlines_strip(hyp_file)
  references = ReadFile.readlines_strip(ref_file).map { |l| splitpipe(l, 3) }
  counts = []
  hypotheses.each_with_index { |h, i|
    counts << BLEU::get_counts(h, references[i], n)
  }
  bleu_ counts, n, debug
end
.bleu_(counts, n, debug = false) ⇒ Object
100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
# File 'lib/zipf/bleu.rb', line 100
#
# Computes BLEU from a list of per-segment statistics.
#
# All entries of +counts+ are accumulated into one NgramCounts via plus_eq.
# The score is exp(mean over orders of log(clipped / sum) + brevity term),
# where the brevity term comes from brevity_penalty(hyp_len, ref_len).
# As soon as an order has no clipped matches (or no n-grams at all) the
# score is 0.0.
#
# Fixes relative to the previous version:
# * the zero guard compared the whole +sum+ array with 0 (never true);
#   it now checks the entry for the current order;
# * the debug ratio is no longer computed by dividing by a zero total, and
#   is always a real-valued ratio even if the counts are Integers.
#
# @param counts [Array<NgramCounts>] per-segment statistics
# @param n [Integer] maximum n-gram order
# @param debug [Boolean] when true, per-order precisions and the brevity
#   term are written to STDERR
# @return [Float]
def BLEU::bleu_(counts, n, debug = false)
  corpus_stats = NgramCounts.new n
  counts.each { |c| corpus_stats.plus_eq c }
  logbleu = 0.0
  0.upto(n - 1) { |m|
    clipped = corpus_stats.clipped[m]
    total = corpus_stats.sum[m]
    if debug
      ratio = total == 0 ? Float::NAN : (clipped / total.to_f).round(2)
      STDERR.write "#{m+1} #{clipped} / #{total} = #{ratio}\n"
    end
    # A single order without matches makes the geometric mean zero.
    return 0.0 if clipped == 0 || total == 0
    logbleu += Math.log(clipped) - Math.log(total)
  }
  logbleu /= n
  bp = brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
  STDERR.write "BP #{bp.round 2}\n" if debug
  logbleu += bp
  return Math.exp logbleu
end
.brevity_penalty(c, r, smooth = 0.0) ⇒ Object
96 97 98 |
# File 'lib/zipf/bleu.rb', line 96
#
# Brevity term in the log domain: min(0, 1 - (r + smooth) / c).
# Callers in this module add the result to a log score before
# exponentiating, so 0.0 means "no penalty".
#
# The division is always carried out in floating point; previously an
# Integer +smooth+ combined with Integer lengths silently used integer
# division. A zero-length hypothesis is handled explicitly instead of
# producing NaN or raising ZeroDivisionError.
#
# @param c [Numeric] hypothesis (candidate) length
# @param r [Numeric] reference length
# @param smooth [Numeric] value added to the reference length
# @return [Float] a value <= 0.0; -Infinity when +c+ is zero and
#   +r + smooth+ is positive
def BLEU::brevity_penalty(c, r, smooth = 0.0)
  ref = (r + smooth).to_f
  if c.zero?
    # Limit of 1 - ref / c as c approaches 0 from above.
    return ref > 0 ? -Float::INFINITY : 0.0
  end
  [0.0, 1.0 - ref / c].min
end
.get_counts(hypothesis, references, n, times = 1) ⇒ Object
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
# File 'lib/zipf/bleu.rb', line 76
#
# Collects the n-gram statistics of one hypothesis against its references.
#
# One Ngrams container is filled per reference and one for the hypothesis,
# using the external +ngrams+ helper. For every hypothesis n-gram the raw
# count is added to +sum+ and the count clipped by the highest count seen in
# any single reference is added to +clipped+, both at index (n-gram size - 1).
# The hypothesis length and best matching reference length come from
# best_match_length. Every statistic is multiplied by +times+.
#
# @param hypothesis [String]
# @param references [Array<String>]
# @param n [Integer] maximum n-gram order
# @param times [Numeric] multiplier applied to counts and lengths
# @return [NgramCounts]
def BLEU::get_counts(hypothesis, references, n, times = 1)
  stats = NgramCounts.new n
  ref_bags = references.map { |reference|
    bag = Ngrams.new
    ngrams(reference, n) { |ng| bag.add ng }
    bag
  }
  hyp_bag = Ngrams.new
  ngrams(hypothesis, n) { |ng| hyp_bag.add ng }
  hyp_bag.each { |ng, count|
    order = ng.size - 1
    stats.sum[order] += count * times
    # Clip by the most generous single reference.
    best_ref_count = ref_bags.map { |bag| bag.get_count(ng) }.max
    stats.clipped[order] += [best_ref_count, count].min * times
  }
  hyp_len, ref_len = best_match_length(hypothesis, references)
  stats.hyp_len = hyp_len
  stats.ref_len = ref_len
  stats.hyp_len *= times
  stats.ref_len *= times
  stats
end
.hbleu(hypotheses, references, n, debug = false) ⇒ Object
131 132 |
# File 'lib/zipf/bleu.rb', line 131
#
# Placeholder: the method has no implementation and always returns nil,
# regardless of its arguments.
#
# @param hypotheses [Object] unused
# @param references [Object] unused
# @param n [Object] unused
# @param debug [Object] unused
# @return [nil]
def BLEU::hbleu(hypotheses, references, n, debug = false)
  nil
end
.hbleu_(counts, n, debug = false) ⇒ Object
127 128 129 |
# File 'lib/zipf/bleu.rb', line 127
#
# BLEU from precomputed statistics, scaled to 0..100 and rounded to three
# decimal places.
#
# This delegates to bleu_, which accepts a list of counts. It previously
# called bleu, whose parameters are (hyp_file, ref_file, n, debug), so the
# counts were passed as a file name and +debug+ as the n-gram order.
#
# @param counts [Array<NgramCounts>] per-segment statistics
# @param n [Integer] maximum n-gram order
# @param debug [Boolean] forwarded to bleu_
# @return [Float]
def BLEU::hbleu_(counts, n, debug = false)
  (100 * bleu_(counts, n, debug)).round(3)
end
.per_sentence_bleu(hypothesis, references, n = 4, smooth = 0.0) ⇒ Object
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
# File 'lib/zipf/bleu.rb', line 134
#
# Sentence-level BLEU of one hypothesis against one or more references.
#
# N-grams come from the external +ngrams+ helper and are bucketed by their
# size (the helper is not visible here; it is assumed to yield n-grams of
# sizes 1..n — confirm). For each order the hypothesis counts are clipped by
# the highest count found in any single reference. Orders >= 2 add 1.0 to
# both numerator and denominator; unigrams are not smoothed. Only orders up
# to the token count of the longest reference are scored.
#
# Degenerate inputs now yield 0.0 instead of NaN: a hypothesis without any
# matching unigram (including an empty hypothesis, which used to evaluate
# log(0) - log(0)), and the case where no order can be scored at all.
#
# @param hypothesis [String]
# @param references [Array<String>]
# @param n [Integer] maximum n-gram order
# @param smooth [Numeric] forwarded to brevity_penalty
# @return [Float]
def BLEU::per_sentence_bleu(hypothesis, references, n = 4, smooth = 0.0)
  num_ref = references.size
  h_ng = {}
  r_ng = Array.new(num_ref) { {} }
  1.upto(n) { |order|
    h_ng[order] = []
    r_ng.each { |ref| ref[order] = [] }
  }
  ngrams(hypothesis, n) { |ng| h_ng[ng.size] << ng }
  references.each_with_index { |reference, j|
    ngrams(reference, n) { |ng| r_ng[j][ng.size] << ng }
  }
  # Highest order that the longest reference can contain.
  m = [n, references.map { |ref| ref.split.size }.max].min
  return 0.0 if m < 1
  logbleu = 0.0
  1.upto(m) { |order|
    hyp = h_ng[order]
    counts_sum = hyp.size
    counts_clipped = 0
    hyp.uniq.each { |ng|
      best_ref_count = r_ng.map { |ref| ref[order].count(ng) }.max
      counts_clipped += [hyp.count(ng), best_ref_count].min
    }
    add = order >= 2 ? 1.0 : 0.0
    # Only reachable for unigrams: log(0) would drive the score to 0 (or to
    # NaN when the hypothesis is empty), so answer 0.0 directly.
    return 0.0 if counts_clipped + add == 0
    logbleu += Math.log(counts_clipped + add) - Math.log(counts_sum + add)
  }
  logbleu /= m
  hyp_len, best_ref_len = BLEU::best_match_length hypothesis, references
  logbleu += brevity_penalty hyp_len, best_ref_len, smooth
  Math.exp logbleu
end