Class: CharDet::HebrewProber
- Inherits:
-
CharSetProber
- Object
- CharSetProber
- CharDet::HebrewProber
- Defined in:
- lib/rchardet/hebrewprober.rb
Instance Attribute Summary
Attributes inherited from CharSetProber
Instance Method Summary collapse
- #feed(aBuf) ⇒ Object
- #get_charset_name ⇒ Object
- #get_state ⇒ Object
-
#initialize ⇒ HebrewProber
constructor
A new instance of HebrewProber.
- #is_final(c) ⇒ Object
- #is_non_final(c) ⇒ Object
- #reset ⇒ Object
- #set_model_probers(logicalProber, visualProber) ⇒ Object
Methods inherited from CharSetProber
#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_confidence
Constructor Details
#initialize ⇒ HebrewProber
Returns a new instance of HebrewProber.
151 152 153 154 155 156 |
# File 'lib/rchardet/hebrewprober.rb', line 151
# Builds a prober with no model probers attached yet; they are supplied
# later through set_model_probers. The scoring state is then initialised
# via reset.
def initialize
  super()
  @_mLogicalProber, @_mVisualProber = nil, nil
  reset
end
Instance Method Details
#feed(aBuf) ⇒ Object
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 |
# File 'lib/rchardet/hebrewprober.rb', line 192
# Scans a buffer character by character and accumulates evidence for
# logical vs. visual Hebrew in @_mFinalCharLogicalScore and
# @_mFinalCharVisualScore. The last two characters seen are kept in
# @_mPrev / @_mBeforePrev so that words spanning two buffers are handled.
#
# aBuf - the raw input buffer (passed through filter_high_bit_only first).
#
# Returns ENotMe when get_state reports ENotMe, otherwise EDetecting.
def feed(aBuf)
  # Final letter analysis for logical-visual decision.
  # Look for evidence that the received buffer is either logical Hebrew or
  # visual Hebrew.
  # The following cases are checked:
  # 1) A word longer than 1 letter, ending with a final letter. This is an
  #    indication that the text is laid out "naturally" since the final letter
  #    really appears at the end. +1 for logical score.
  # 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
  #    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
  #    the Non-Final form of that letter. Exceptions to this rule are mentioned
  #    in is_non_final. This is an indication that the text is laid out
  #    backwards. +1 for visual score
  # 3) A word longer than 1 letter, starting with a final letter. Final letters
  #    should not appear at the beginning of a word. This is an indication that
  #    the text is laid out backwards. +1 for visual score.
  #
  # The visual score and logical score are accumulated throughout the text and
  # are finally checked against each other in get_charset_name.
  # No checking for final letters in the middle of words is done since that case
  # is not an indication for either Logical or Visual text.
  #
  # We automatically filter out all 7-bit characters (replace them with spaces)
  # so the word boundary detection works properly. [MAP]

  # Both model probers say it's not them. No reason to continue.
  return ENotMe if get_state == ENotMe

  aBuf = filter_high_bit_only(aBuf)

  # Walk the buffer one character at a time. Splitting on ' ' would yield
  # whole words and never the delimiter itself, so the space checks below
  # could never fire.
  # NOTE(review): each_char yields one-character strings in the buffer's own
  # encoding; this assumes the letter constants used by is_final /
  # is_non_final are comparable single-character strings - confirm against
  # their definitions.
  aBuf.each_char do |cur|
    if cur == ' '
      # We stand on a space - a word just ended
      if @_mBeforePrev != ' '
        # next-to-last char was not a space so @_mPrev is not a 1 letter word
        if is_final(@_mPrev)
          # case (1) [-2:not space][-1:final letter][cur:space]
          @_mFinalCharLogicalScore += 1
        elsif is_non_final(@_mPrev)
          # case (2) [-2:not space][-1:Non-Final letter][cur:space]
          @_mFinalCharVisualScore += 1
        end
      end
    elsif @_mBeforePrev == ' ' && is_final(@_mPrev)
      # Not standing on a space
      # case (3) [-2:space][-1:final letter][cur:not space]
      @_mFinalCharVisualScore += 1
    end
    @_mBeforePrev = @_mPrev
    @_mPrev = cur
  end

  # Forever detecting, till the end or until both model probers return
  # ENotMe (handled above)
  EDetecting
end
#get_charset_name ⇒ Object
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 |
# File 'lib/rchardet/hebrewprober.rb', line 252
# Decides between the logical and the visual Hebrew charset name, using the
# accumulated final-letter scores first and the model probers' confidences
# as a tie-breaker.
def get_charset_name
  # If the final letter score distance is dominant enough, rely on it.
  final_diff = @_mFinalCharLogicalScore - @_mFinalCharVisualScore
  return LOGICAL_HEBREW_NAME if final_diff >= MIN_FINAL_CHAR_DISTANCE
  return VISUAL_HEBREW_NAME if final_diff <= -MIN_FINAL_CHAR_DISTANCE

  # Not dominant enough: fall back to the difference in model confidence.
  model_diff = @_mLogicalProber.get_confidence - @_mVisualProber.get_confidence
  return LOGICAL_HEBREW_NAME if model_diff > MIN_MODEL_DISTANCE
  return VISUAL_HEBREW_NAME if model_diff < -MIN_MODEL_DISTANCE

  # Still undecided: any negative final-letter distance means Visual;
  # a positive distance or a complete tie defaults to Logical.
  final_diff < 0.0 ? VISUAL_HEBREW_NAME : LOGICAL_HEBREW_NAME
end
#get_state ⇒ Object
281 282 283 284 285 286 287 |
# File 'lib/rchardet/hebrewprober.rb', line 281
# Reports ENotMe only once both model probers report ENotMe; while either
# of them is still active this prober keeps reporting EDetecting.
def get_state
  model_probers = [@_mLogicalProber, @_mVisualProber]
  # all? stops at the first prober that is still active, so the visual
  # prober is only queried when the logical one has already given up.
  both_gave_up = model_probers.all? { |prober| prober.get_state == ENotMe }
  both_gave_up ? ENotMe : EDetecting
end
#is_final(c) ⇒ Object
174 175 176 |
# File 'lib/rchardet/hebrewprober.rb', line 174
# Tells whether c equals one of the five final-form letter constants
# (Kaf, Mem, Nun, Pe, Tsadi).
def is_final(c)
  final_forms = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
  final_forms.include?(c)
end
#is_non_final(c) ⇒ Object
178 179 180 181 182 183 184 185 186 187 188 189 190 |
# File 'lib/rchardet/hebrewprober.rb', line 178
# Tells whether c equals one of the normal (non-final) forms of Kaf, Mem,
# Nun or Pe. Tsadi is deliberately left out of the list.
def is_non_final(c)
  # Why no Tsadi: words like 'lechotet' (to chat) carry an apostrophe after
  # the tsadi. That apostrophe is turned into a space by the filtering step,
  # so a Non-Final tsadi would appear to end a word even though it does not
  # in the original text.
  # Pe and Kaf occasionally show a similar problem - 'Pop', 'Winamp' and
  # 'Mubarak' legitimately end with a Non-Final Pe or Kaf - but such words
  # are rare enough that keeping these letters helps more than it hurts.
  non_final_forms = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
  non_final_forms.include?(c)
end
#reset ⇒ Object
158 159 160 161 162 163 164 165 166 167 |
# File 'lib/rchardet/hebrewprober.rb', line 158
# Clears both final-letter scores and the two-character look-behind state.
# The model probers are left untouched: they are owned by the group prober.
def reset
  @_mFinalCharLogicalScore, @_mFinalCharVisualScore = 0, 0

  # The last two characters seen in the previous buffer start out as spaces
  # so that the very first data behaves as if preceded by a word delimiter.
  @_mPrev = ' '
  @_mBeforePrev = ' '
end
#set_model_probers(logicalProber, visualProber) ⇒ Object
169 170 171 172 |
# File 'lib/rchardet/hebrewprober.rb', line 169
# Stores the two model probers whose state and confidence this prober
# consults in get_state and get_charset_name.
#
# logicalProber - prober kept in @_mLogicalProber.
# visualProber  - prober kept in @_mVisualProber.
#
# Returns visualProber.
def set_model_probers(logicalProber, visualProber)
  @_mLogicalProber, @_mVisualProber = logicalProber, visualProber
  visualProber
end