Class: CharDet::HebrewProber

Inherits:

CharSetProber

Object
CharSetProber
CharDet::HebrewProber

show all

Defined in: lib/rchardet/hebrewprober.rb

Instance Attribute Summary

Attributes inherited from CharSetProber

#active

Instance Method Summary collapse

#feed(aBuf) ⇒ Object
#get_charset_name ⇒ Object
#get_state ⇒ Object
#initialize ⇒ HebrewProber constructor

A new instance of HebrewProber.
#is_final(c) ⇒ Object
#is_non_final(c) ⇒ Object
#reset ⇒ Object
#set_model_probers(logicalProber, visualProber) ⇒ Object

Methods inherited from CharSetProber

#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_confidence

Constructor Details

#initialize ⇒ `HebrewProber`

Returns a new instance of HebrewProber.

# File 'lib/rchardet/hebrewprober.rb', line 151

def initialize
  super()
  # No model probers are attached yet; set_model_probers assigns them later.
  @_mLogicalProber = @_mVisualProber = nil
  # Start from a clean scoring state.
  reset
end

Instance Method Details

#feed(aBuf) ⇒ `Object`

# File 'lib/rchardet/hebrewprober.rb', line 192

# Scans aBuf character by character and accumulates evidence for the
# logical-vs-visual Hebrew decision made later in get_charset_name.
#
# aBuf:: the raw buffer to analyse; it is passed through
#        filter_high_bit_only before scanning.
#
# Returns ENotMe when get_state already reports ENotMe, otherwise EDetecting.
def feed(aBuf)
  # Final letter analysis for logical-visual decision.
  # Look for evidence that the received buffer is either logical Hebrew or
  # visual Hebrew.
  # The following cases are checked:
  # 1) A word longer than 1 letter, ending with a final letter. This is an
  #    indication that the text is laid out "naturally" since the final letter
  #    really appears at the end. +1 for logical score.
  # 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
  #    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
  #    the Non-Final form of that letter. Exceptions to this rule are mentioned
  #    in is_non_final. This is an indication that the text is laid out
  #    backwards. +1 for visual score.
  # 3) A word longer than 1 letter, starting with a final letter. Final letters
  #    should not appear at the beginning of a word. This is an indication that
  #    the text is laid out backwards. +1 for visual score.
  #
  # The visual score and logical score are accumulated throughout the text and
  # are finally checked against each other in get_charset_name.
  # No checking for final letters in the middle of words is done since that case
  # is not an indication for either Logical or Visual text.
  #
  # We automatically filter out all 7-bit characters (replace them with spaces)
  # so the word boundary detection works properly. [MAP]

  # Both model probers say it's not them. No reason to continue.
  return ENotMe if get_state == ENotMe

  aBuf = filter_high_bit_only(aBuf)

  # Walk the buffer one character at a time. Splitting on ' ' would yield whole
  # words and never a ' ' element, so the word-boundary cases below could not
  # fire. NOTE(review): assumes the filtered buffer is a single-byte encoded
  # String so that each character is one letter - confirm against callers.
  aBuf.each_char do |cur|
    if cur == ' '
      # We stand on a space - a word just ended.
      if @_mBeforePrev != ' '
        # Next-to-last char was not a space, so @_mPrev is not a 1 letter word.
        if is_final(@_mPrev)
          # case (1) [-2:not space][-1:final letter][cur:space]
          @_mFinalCharLogicalScore += 1
        elsif is_non_final(@_mPrev)
          # case (2) [-2:not space][-1:Non-Final letter][cur:space]
          @_mFinalCharVisualScore += 1
        end
      end
    elsif @_mBeforePrev == ' ' && is_final(@_mPrev)
      # Not standing on a space.
      # case (3) [-2:space][-1:final letter][cur:not space]
      @_mFinalCharVisualScore += 1
    end
    # Slide the two-character history window forward.
    @_mBeforePrev = @_mPrev
    @_mPrev = cur
  end

  # Forever detecting, till the end or until both model probers return ENotMe
  # (handled above).
  EDetecting
end

#get_charset_name ⇒ `Object`

# File 'lib/rchardet/hebrewprober.rb', line 252

# Decides between the logical and the visual Hebrew charset name.
#
# Priority of evidence:
#   1. a final-letter score gap of at least MIN_FINAL_CHAR_DISTANCE,
#   2. a model-confidence gap larger than MIN_MODEL_DISTANCE,
#   3. the sign of the final-letter score gap,
#   4. otherwise Logical is the default.
def get_charset_name
  final_distance = @_mFinalCharLogicalScore - @_mFinalCharVisualScore

  # A dominant final-letter distance settles the question immediately.
  return LOGICAL_HEBREW_NAME if final_distance >= MIN_FINAL_CHAR_DISTANCE
  return VISUAL_HEBREW_NAME if final_distance <= -MIN_FINAL_CHAR_DISTANCE

  # Not dominant enough: compare the confidences of the two model probers.
  logical_confidence = @_mLogicalProber.get_confidence
  visual_confidence = @_mVisualProber.get_confidence
  model_distance = logical_confidence - visual_confidence
  return LOGICAL_HEBREW_NAME if model_distance > MIN_MODEL_DISTANCE
  return VISUAL_HEBREW_NAME if model_distance < -MIN_MODEL_DISTANCE

  # Models are inconclusive too: any visual lead in the final-letter score
  # wins, everything else (logical lead or a tie) defaults to Logical.
  final_distance < 0.0 ? VISUAL_HEBREW_NAME : LOGICAL_HEBREW_NAME
end

#get_state ⇒ `Object`

# File 'lib/rchardet/hebrewprober.rb', line 281

# Reports ENotMe only once both model probers report ENotMe; while at least
# one of them is still in another state this prober keeps detecting.
def get_state
  both_gave_up = @_mLogicalProber.get_state == ENotMe &&
                 @_mVisualProber.get_state == ENotMe
  both_gave_up ? ENotMe : EDetecting
end

#is_final(c) ⇒ `Object`



174
175
176

# File 'lib/rchardet/hebrewprober.rb', line 174

# True when c equals one of the final-form letter constants
# (Kaf, Mem, Nun, Pe, Tsadi).
def is_final(c)
  final_forms = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI]
  final_forms.any? { |letter| letter == c }
end

#is_non_final(c) ⇒ `Object`

# File 'lib/rchardet/hebrewprober.rb', line 178

# True when c equals one of the normal (non-final) forms of Kaf, Mem, Nun
# or Pe.
def is_non_final(c)
  # Normal Tsadi is deliberately left out: in words such as 'lechotet'
  # (to chat) it is followed by an apostrophe, and that apostrophe is turned
  # into a space by FilterWithoutEnglishLetters. The Non-Final tsadi then
  # looks like it ends a word although it does not in the original text.
  # Pe and Kaf occasionally misbehave in a similar way - 'Pop', 'Winamp' and
  # 'Mubarak' legitimately end with a Non-Final Pe or Kaf - but such words
  # are rare, so keeping these letters helps more than it hurts.
  non_final_forms = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
  non_final_forms.any? { |letter| letter == c }
end

#reset ⇒ `Object`

# File 'lib/rchardet/hebrewprober.rb', line 158

# Clears the accumulated scores and the two-character history.
def reset
  # Both final-letter counters start from zero.
  @_mFinalCharLogicalScore = @_mFinalCharVisualScore = 0
  # mPrev and mBeforePrev hold the last two characters seen in the previous
  # buffer; starting them as spaces simulates a word delimiter at the
  # beginning of the data.
  # The model probers are owned by the group prober, so they are left alone.
  @_mPrev = ' '
  @_mBeforePrev = ' '
end

#set_model_probers(logicalProber, visualProber) ⇒ `Object`

# File 'lib/rchardet/hebrewprober.rb', line 169

# Stores the logical and visual model probers on this instance.
def set_model_probers(logicalProber, visualProber)
  @_mLogicalProber, @_mVisualProber = logicalProber, visualProber
  # The method evaluates to the visual prober, the last value assigned.
  visualProber
end

Class: CharDet::HebrewProber

Instance Attribute Summary

Attributes inherited from CharSetProber

Instance Method Summary collapse

Methods inherited from CharSetProber

Constructor Details

#initialize ⇒ HebrewProber

Instance Method Details

#feed(aBuf) ⇒ Object

#get_charset_name ⇒ Object

#get_state ⇒ Object

#is_final(c) ⇒ Object

#is_non_final(c) ⇒ Object

#reset ⇒ Object

#set_model_probers(logicalProber, visualProber) ⇒ Object

#initialize ⇒ `HebrewProber`

#feed(aBuf) ⇒ `Object`

#get_charset_name ⇒ `Object`

#get_state ⇒ `Object`

#is_final(c) ⇒ `Object`

#is_non_final(c) ⇒ `Object`

#reset ⇒ `Object`

#set_model_probers(logicalProber, visualProber) ⇒ `Object`