Module: PROIEL::Alignment::Builder
- Defined in:
- lib/proiel/alignment/builder.rb
Class Method Summary collapse
-
.compute_matrix(alignment, source, blacklist = [], log_directory = nil) ⇒ Object
This computes a matrix of original and translation sentences that are aligned.
Class Method Details
.compute_matrix(alignment, source, blacklist = [], log_directory = nil) ⇒ Object
This computes a matrix of original and translation sentences that are aligned. For now, this function does not handle translation sentences that are unaligned (this is tricky to handle robustly!). As the current treebank collection stands, this is an issue that should not arise, so for now this is a reasonable approximation.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
# File 'lib/proiel/alignment/builder.rb', line 9
#
# Computes a matrix of original and translation sentences that are aligned.
#
# Two intermediate groupings are built with +group_backwards+ and
# +group_forwards+ and then merged row by row into a single matrix. Rows of
# the first grouping whose +:translation+ is empty and rows of the second
# grouping whose +:original+ is empty are copied over as they are; every
# other pair of rows is handed to +repair+, which is tried with increasing
# look-ahead offsets until one call succeeds.
#
# For now, this does not handle translation sentences that are unaligned.
#
# @param alignment [#sentences] the text whose sentence IDs must appear, in
#   order, in the +:original+ entries of the result
# @param source [#id, #sentences] the text whose sentence IDs must appear,
#   in order, in the +:translation+ entries of the result; its +id+ is used
#   as the prefix of the log file names
# @param blacklist [Array] passed through unchanged to +group_backwards+
#   and +group_forwards+
# @param log_directory [String, nil] when given, the two intermediate
#   groupings and the final matrix are written (one inspected row per line)
#   to files named "<source.id>1", "<source.id>2" and "<source.id>3" in this
#   directory
# @return [Array] the merged rows; each row is indexed with +:original+ and
#   +:translation+
# @raise [RuntimeError] if an intermediate grouping or the final matrix does
#   not list the sentence IDs in order, if no +repair+ attempt succeeds, or
#   if one grouping is exhausted before the other
def self.compute_matrix(alignment, source, blacklist = [], log_directory = nil)
  # Sentence IDs found on one side (:original or :translation) of all rows.
  ids_on = ->(rows, side) { rows.map { |row| row[side] }.flatten.compact }

  # Writes one inspected row per line to "<source.id><suffix>".
  dump = lambda do |rows, suffix|
    File.open(File.join(log_directory, "#{source.id}#{suffix}"), 'w') do |f|
      rows.each { |row| f.puts row.inspect }
    end
  end

  matrix1 = group_backwards(alignment, source, blacklist)
  original_ids = alignment.sentences.map(&:id)
  unless ids_on.call(matrix1, :original) == original_ids
    raise 'backward grouping does not list the sentence IDs of the alignment text in order'
  end

  matrix2 = group_forwards(alignment, source, blacklist)
  translation_ids = source.sentences.map(&:id)
  unless ids_on.call(matrix2, :translation) == translation_ids
    raise 'forward grouping does not list the sentence IDs of the source text in order'
  end

  if log_directory
    # Both groupings have passed the sequence checks above at this point.
    dump.call(matrix1, 1)
    dump.call(matrix2, 2)
  end

  # Look-ahead offsets handed to repair, in the order they are tried:
  # (0,0), (1,0), (0,1), (1,1), (2,0), (0,2), (2,1), (1,2), (2,2), ... (4,4).
  repair_offsets = (0..4).flat_map do |k|
    (0...k).flat_map { |j| [[k, j], [j, k]] } + [[k, k]]
  end

  matrix = []
  # Cursors into the two groupings; repair receives these same hashes.
  iter1 = { i: 0, m: matrix1 }
  iter2 = { i: 0, m: matrix2 }

  loop do
    # Take from matrix1 unless we have a translation
    while iter1[:i] < iter1[:m].length && iter1[:m][iter1[:i]][:translation].empty?
      matrix << iter1[:m][iter1[:i]]
      iter1[:i] += 1
    end

    # Take from matrix2 unless we have an original
    while iter2[:i] < iter2[:m].length && iter2[:m][iter2[:i]][:original].empty?
      matrix << iter2[:m][iter2[:i]]
      iter2[:i] += 1
    end

    if iter1[:i] < iter1[:m].length && iter2[:i] < iter2[:m].length
      # Now the two should match provided alignments are sorted the same way,
      # so take one from each. If they don't match outright, we may have a
      # case of swapped sentence orders or a gap (one sentence unaligned in
      # one of the texts surrounded by two sentences that are aligned to the
      # same sentence in the other text). We'll try to repair this by merging
      # bits from the next row in various combinations.
      #
      # When adding to the new matrix, pick original from matrix1 and
      # translation from matrix2 so that the original textual order is
      # preserved.
      #
      # any? stops at the first successful repair, exactly like an or-chain.
      repaired = repair_offsets.any? do |offset1, offset2|
        repair(matrix, iter1, offset1, iter2, offset2)
      end

      unless repaired
        STDERR.puts iter1[:i], iter1[:m][iter1[:i]].inspect
        STDERR.puts iter2[:i], iter2[:m][iter2[:i]].inspect
        raise "unable to reconcile row #{iter1[:i]} of the backward grouping " \
              "with row #{iter2[:i]} of the forward grouping"
      end
    else
      # One grouping is exhausted, so the other one must be as well.
      unless iter1[:i] == iter1[:m].length && iter2[:i] == iter2[:m].length
        raise "groupings ended at different points (backward grouping at row #{iter1[:i]} " \
              "of #{iter1[:m].length}, forward grouping at row #{iter2[:i]} of #{iter2[:m].length})"
      end

      break
    end
  end

  dump.call(matrix, 3) if log_directory

  unless ids_on.call(matrix, :original) == original_ids
    raise 'merged matrix does not list the sentence IDs of the alignment text in order'
  end
  unless ids_on.call(matrix, :translation) == translation_ids
    raise 'merged matrix does not list the sentence IDs of the source text in order'
  end

  matrix
end