Class: UniversalDetector::HebrewProber
- Inherits: CharSetProber
  - Object
    - CharSetProber
      - UniversalDetector::HebrewProber
- Defined in:
- lib/HebrewProber.rb
Instance Method Summary
- #feed(aBuf) ⇒ Object
- #get_charset_name ⇒ Object
- #get_state ⇒ Object
- #initialize ⇒ HebrewProber (constructor): A new instance of HebrewProber.
- #is_final(c) ⇒ Object
- #is_non_final(c) ⇒ Object
- #reset ⇒ Object
- #set_model_probers(logicalProber, visualProber) ⇒ Object
Methods inherited from CharSetProber
#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_confidence
Constructor Details
#initialize ⇒ HebrewProber
Returns a new instance of HebrewProber.
154 155 156 157 158 159 |
# File 'lib/HebrewProber.rb', line 154
#
# Creates a prober with no model probers attached and with all
# final-letter bookkeeping in its initial state. The logical and visual
# model probers are supplied afterwards through #set_model_probers.
def initialize
  super
  @_mLogicalProber = @_mVisualProber = nil
  reset
end
Instance Method Details
#feed(aBuf) ⇒ Object
195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 |
# File 'lib/HebrewProber.rb', line 195
#
# Final letter analysis for the logical-vs-visual decision.
#
# Scans the buffer for evidence that it is either logical Hebrew or visual
# Hebrew. The following cases are checked:
#
# 1) A word longer than 1 letter, ending with a final letter. The text is
#    laid out "naturally" since the final letter really appears at the end.
#    +1 for the logical score.
# 2) A word longer than 1 letter, ending with a non-final letter. In normal
#    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi should not end with
#    the non-final form of that letter (exceptions are discussed in
#    #is_non_final). This suggests the text is laid out backwards.
#    +1 for the visual score.
# 3) A word longer than 1 letter, starting with a final letter. Final letters
#    should not appear at the beginning of a word, so the text is presumably
#    laid out backwards. +1 for the visual score.
#
# Both scores accumulate across calls and are compared in #get_charset_name.
# Final letters in the middle of words are not examined, since that case
# indicates neither logical nor visual text.
#
# The buffer is first passed through filter_high_bit_only; per the original
# author's note this replaces 7-bit characters with spaces so that word
# boundary detection works.
#
# @param aBuf the raw input buffer
# @return [Symbol] :NotMe once both model probers have given up,
#   otherwise :Detecting
def feed(aBuf)
  # Both model probers say it's not them. No reason to continue.
  return :NotMe if get_state == :NotMe

  filtered = filter_high_bit_only(aBuf)

  # A String cannot be walked with `for ... in` on Ruby 1.9+ (it has no
  # #each), so iterate it character by character. Anything else is
  # enumerated as-is, exactly as before.
  # NOTE(review): each_char follows the string's encoding; presumably the
  # buffer holds single-byte text here - confirm against the caller.
  symbols = filtered.respond_to?(:each_char) ? filtered.each_char : filtered

  symbols.each do |cur|
    if cur == ' '
      # We stand on a space - a word just ended.
      # If the next-to-last char was a space, @_mPrev is a 1-letter word
      # and is ignored.
      if @_mBeforePrev != ' '
        if is_final(@_mPrev)
          # case (1) [-2:not space][-1:final letter][cur:space]
          @_mFinalCharLogicalScore += 1
        elsif is_non_final(@_mPrev)
          # case (2) [-2:not space][-1:non-final letter][cur:space]
          @_mFinalCharVisualScore += 1
        end
      end
    elsif @_mBeforePrev == ' ' && is_final(@_mPrev)
      # case (3) [-2:space][-1:final letter][cur:not space]
      @_mFinalCharVisualScore += 1
    end

    # Slide the two-character window forward.
    @_mBeforePrev = @_mPrev
    @_mPrev = cur
  end

  # Forever detecting, till the end or until both model probers return
  # :NotMe (handled at the top of the method).
  :Detecting
end
#get_charset_name ⇒ Object
255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 |
# File 'lib/HebrewProber.rb', line 255
#
# Decides whether the text seen so far is logical or visual Hebrew.
#
# The decision is made in three stages:
#   1. the gap between the final-letter scores, if it reaches
#      MIN_FINAL_CHAR_DISTANCE in either direction;
#   2. otherwise the gap between the two model probers' confidences, if it
#      exceeds MIN_MODEL_DISTANCE in either direction;
#   3. otherwise the sign of the final-letter gap, defaulting to logical
#      when it is zero or positive.
#
# @return LOGICAL_HEBREW_NAME or VISUAL_HEBREW_NAME
def get_charset_name
  # Stage 1: a dominant final-letter score settles it outright.
  final_gap = @_mFinalCharLogicalScore - @_mFinalCharVisualScore
  return LOGICAL_HEBREW_NAME if final_gap >= MIN_FINAL_CHAR_DISTANCE
  return VISUAL_HEBREW_NAME if final_gap <= -MIN_FINAL_CHAR_DISTANCE

  # Stage 2: not dominant enough, so lean on the model probers instead.
  model_gap = @_mLogicalProber.get_confidence - @_mVisualProber.get_confidence
  return LOGICAL_HEBREW_NAME if model_gap > MIN_MODEL_DISTANCE
  return VISUAL_HEBREW_NAME if model_gap < -MIN_MODEL_DISTANCE

  # Stage 3: still undecided - fall back to the sign of the final-letter
  # gap; a tie (or no information at all) defaults to logical.
  final_gap < 0.0 ? VISUAL_HEBREW_NAME : LOGICAL_HEBREW_NAME
end
#get_state ⇒ Object
284 285 286 287 288 289 290 |
# File 'lib/HebrewProber.rb', line 284
#
# Reports whether this prober is still worth feeding.
#
# It stays active for as long as at least one of the two model probers is
# active; only when both report :NotMe does it report :NotMe itself.
#
# @return [Symbol] :NotMe or :Detecting
def get_state
  # The visual prober is only consulted once the logical one has given up.
  return :Detecting unless @_mLogicalProber.get_state == :NotMe
  return :Detecting unless @_mVisualProber.get_state == :NotMe

  :NotMe
end
#is_final(c) ⇒ Object
177 178 179 |
# File 'lib/HebrewProber.rb', line 177
#
# Tells whether +c+ is one of the final-form letters
# (FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI).
#
# @param c the character to classify
# @return [Boolean]
def is_final(c)
  final_forms = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
  final_forms.include?(c)
end
#is_non_final(c) ⇒ Object
181 182 183 184 185 186 187 188 189 190 191 192 193 |
# File 'lib/HebrewProber.rb', line 181
#
# Tells whether +c+ is one of the non-final letter forms that are treated
# as evidence (NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE).
#
# The normal Tsadi is deliberately left out: it is not a good non-final
# letter because of words like 'lechotet' (to chat), which contain an
# apostrophe after the tsadi. That apostrophe is converted to a space by
# filter_without_english_letters, so the non-final tsadi appears to end a
# word even though it does not in the original text.
#
# Pe and Kaf occasionally show a related weakness: words like 'Pop',
# 'Winamp' and 'Mubarak' legally end with a non-final Pe or Kaf. Such words
# are rare enough that the benefit of keeping these letters outweighs the
# damage.
#
# @param c the character to classify
# @return [Boolean]
def is_non_final(c)
  non_final_forms = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
  non_final_forms.include?(c)
end
#reset ⇒ Object
161 162 163 164 165 166 167 168 169 170 |
# File 'lib/HebrewProber.rb', line 161
#
# Returns the final-letter bookkeeping to its initial state.
#
# Both scores go back to zero, and the two remembered characters (the last
# two seen in the previous buffer) are set to spaces so that the start of
# the data behaves as if preceded by a word delimiter.
#
# The model probers are left untouched; per the original note they are
# owned by the group prober.
def reset
  @_mFinalCharLogicalScore = @_mFinalCharVisualScore = 0

  # Separate literals on purpose: the two slots must not share one String.
  @_mBeforePrev = ' '
  @_mPrev = ' '
end
#set_model_probers(logicalProber, visualProber) ⇒ Object
172 173 174 175 |
# File 'lib/HebrewProber.rb', line 172
#
# Attaches the two model probers whose state and confidence this prober
# consults in #get_state and #get_charset_name.
#
# @param logicalProber prober for the logical layout
# @param visualProber prober for the visual layout
# @return the visual prober (the value of the last assignment)
def set_model_probers(logicalProber, visualProber)
  @_mLogicalProber, @_mVisualProber = logicalProber, visualProber
  visualProber
end