Class: Glaemscribe::API::UpDownTehtaSplitPreProcessorOperator

Inherits:
PreProcessorOperator show all
Defined in:
lib/api/pre_processor/up_down_tehta_split.rb

Instance Attribute Summary collapse

Attributes inherited from PrePostProcessorOperator

#finalized_glaeml_element, #glaeml_element

Instance Method Summary collapse

Methods inherited from PrePostProcessorOperator

#eval_arg, #finalize_glaeml_element, #initialize

Constructor Details

This class inherits a constructor from Glaemscribe::API::PrePostProcessorOperator

Instance Attribute Details

#consonant_list ⇒ Object (readonly)

Returns the value of attribute consonant_list.



28
29
30
# File 'lib/api/pre_processor/up_down_tehta_split.rb', line 28

# Read-only accessor for the @consonant_list instance variable.
#
# NOTE(review): presumably meant to expose the consonant tokens configured
# for this operator — confirm where @consonant_list is populated.
#
# @return [Object, nil] the current value of @consonant_list (nil when the
#   instance variable has never been assigned)
def consonant_list
  @consonant_list
end

#vowel_list ⇒ Object (readonly)

Returns the value of attribute vowel_list.



28
29
30
# File 'lib/api/pre_processor/up_down_tehta_split.rb', line 28

# Read-only accessor for the @vowel_list instance variable.
#
# NOTE(review): presumably meant to expose the vowel tokens configured for
# this operator — confirm where @vowel_list is populated.
#
# @return [Object, nil] the current value of @vowel_list (nil when the
#   instance variable has never been assigned)
def vowel_list
  @vowel_list
end

Instance Method Details

#apply(content) ⇒ Object



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/api/pre_processor/up_down_tehta_split.rb', line 115

# Runs the operator over a whole text.
#
# The text is scanned character by character. Characters that are keys of
# @word_split_map are gathered into the current word; any other character
# closes the current word, which is handed to #apply_to_word, and is then
# copied to the output verbatim.
#
# @param content [String] the text to process
# @return [String] the processed text
def apply(content)
  output  = "".dup
  pending = "".dup

  content.split(//).each do |char|
    if @word_split_map[char]
      pending << char
      next
    end

    # Separator: flush the word gathered so far (possibly empty), then
    # keep the separator itself unchanged.
    output << apply_to_word(pending)
    output << char
    pending = "".dup
  end

  # Flush whatever word is still pending at the end of the text.
  output << apply_to_word(pending)
end

#apply_to_word(w) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/api/pre_processor/up_down_tehta_split.rb', line 62

# Tokenizes a single word and breaks up every consonant-vowel-consonant run.
#
# The word is cut into tokens with @splitter_tree; each token is classified
# through #type_of. Whenever three consecutive tokens are classified
# "C", "V", "C", a phantom vowel "@" is inserted between the first consonant
# and the vowel (CVC becomes C@VC), so the CVC pattern no longer occurs in
# the result.
#
# @param w [String] the word to process; a blank word is returned unchanged
# @return [String] the re-joined tokens with phantom vowels inserted
def apply_to_word(w)
  tokens = []

  if w.strip.empty?
    # Blank input is passed through as a single token.
    tokens.push(w)
  else
    rest = w
    until rest.empty?
      match, consumed = @splitter_tree.transcribe(rest)
      # When the tree reports the unknown-character marker, keep the raw
      # first character of the remaining text instead of that marker.
      tokens.push(match == [UNKNOWN_CHAR_OUTPUT] ? rest[0..0] : match)
      rest = rest[consumed..-1]
    end
  end

  out = []
  pos = 0

  # Slide a three-token window over the word.
  while pos < tokens.count - 2
    first, second, third = tokens[pos, 3]

    if [type_of(first), type_of(second), type_of(third)] == ["C", "V", "C"]
      # Emit C, phantom vowel, V; the closing consonant becomes the start
      # of the next window.
      out.push(first, "@", second)
      pos += 2
    else
      out.push(first)
      pos += 1
    end
  end

  # Fewer than three tokens remain: copy them as they are.
  out.concat(tokens.drop(pos))

  out.join("")
end

#finalize(trans_options) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/api/pre_processor/up_down_tehta_split.rb', line 29

# Builds the lookup structures this operator needs from its two arguments.
#
# The first argument is read as a comma-separated list of vowel tokens and
# the second as a comma-separated list of consonant tokens; every entry is
# stripped of surrounding whitespace.
#
# The parsed lists are stored in @vowel_list / @consonant_list so that the
# public readers of the same names return them. Previously they were only
# assigned to local variables, which left both readers returning nil.
#
# @param trans_options [Object] forwarded unchanged to the parent implementation
def finalize(trans_options)
  super(trans_options)

  args = finalized_glaeml_element.args

  @vowel_list     = args[0].split(/,/).map { |s| s.strip }
  @consonant_list = args[1].split(/,/).map { |s| s.strip }

  @vowel_map      = {} # Recognize vowel tokens
  @consonant_map  = {} # Recognize consonant tokens
  @splitter_tree  = TranscriptionTreeNode.new(nil, nil) # Recognize tokens
  @word_split_map = {}
  # The word split map will help to recognize words
  # The splitter tree will help to split words into tokens

  @vowel_list.each do |v|
    @splitter_tree.add_subpath(v, v)
    @vowel_map[v] = v
  end

  @consonant_list.each do |c|
    @splitter_tree.add_subpath(c, c)
    @consonant_map[c] = c
  end

  # Every individual character used by any token counts as a word character.
  all_letters = (@vowel_list + @consonant_list).join("").split(//).sort.uniq
  all_letters.each { |l| @word_split_map[l] = l }
end

#type_of(token) ⇒ Object



52
53
54
55
56
57
58
59
60
# File 'lib/api/pre_processor/up_down_tehta_split.rb', line 52

# Classifies a token as vowel, consonant or neither.
#
# The vowel map is consulted first, so a token present in both maps is
# reported as a vowel.
#
# @param token [Object] the token to look up
# @return [String] "V" if the token is a key of @vowel_map, "C" if it is a
#   key of @consonant_map, "X" otherwise
def type_of(token)
  return "V" if @vowel_map[token]
  return "C" if @consonant_map[token]

  "X"
end