Class: CharDet::CharDistributionAnalysis
- Defined in:
- lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb
Direct Known Subclasses
Big5DistributionAnalysis, EUCJPDistributionAnalysis, EUCKRDistributionAnalysis, EUCTWDistributionAnalysis, GB2312DistributionAnalysis, SJISDistributionAnalysis
Instance Method Summary collapse
- #feed(aStr, aCharLen) ⇒ Object
- #get_confidence ⇒ Object
- #get_order(aStr) ⇒ Object
- #got_enough_data ⇒ Object
- #initialize ⇒ CharDistributionAnalysis (constructor): a new instance of CharDistributionAnalysis.
- #reset ⇒ Object
Constructor Details
#initialize ⇒ CharDistributionAnalysis
Returns a new instance of CharDistributionAnalysis.
36 37 38 39 40 41 |
# File 'lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb', line 36

# Builds a new analyser: the three table-related fields start out as nil and
# the running counters are cleared through #reset.
def initialize
  # Mapping table to get frequency order from char order (get from GetOrder())
  @_mCharToFreqOrder = nil
  # Size of above table
  @_mTableSize = nil
  # This is a constant value which varies from language to language, used in
  # calculating confidence. See
  # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
  # for further detail.
  @_mTypicalDistributionRatio = nil
  reset
end
Instance Method Details
#feed(aStr, aCharLen) ⇒ Object
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb', line 50

# Feed a character with known length into the distribution counters.
#
# aStr     - handed to #get_order to obtain the character's order.
# aCharLen - length of the character; only a length of 2 is analysed,
#            anything else is ignored.
#
# Updates @_mTotalChars for every valid order and @_mFreqChars when that
# order maps to a frequency rank below 512. The return value is not
# meaningful.
def feed(aStr, aCharLen)
  # we only care about 2-bytes character in our distribution analysis
  order = aCharLen == 2 ? get_order(aStr) : -1
  return if order < 0 # a negative order is not valid

  @_mTotalChars += 1
  # orders beyond the table count towards the total only
  return unless order < @_mTableSize

  @_mFreqChars += 1 if @_mCharToFreqOrder[order] < 512
end
#get_confidence ⇒ Object
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
# File 'lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb', line 69

# Return confidence based on existing data.
#
# Returns SURE_NO when no character has been counted, SURE_YES when every
# counted character was a frequent one or when the computed ratio reaches
# SURE_YES, and otherwise the ratio
#   frequent / ((total - frequent) * typical distribution ratio).
def get_confidence
  # if we didn't receive any character in our consideration range, return negative answer
  return SURE_NO if @_mTotalChars <= 0

  if @_mTotalChars != @_mFreqChars
    rare_chars = @_mTotalChars - @_mFreqChars
    # fdiv forces floating-point division, so an Integer-valued distribution
    # ratio cannot silently truncate the confidence to a whole number.
    r = @_mFreqChars.fdiv(rare_chars * @_mTypicalDistributionRatio)
    return r if r < SURE_YES
  end

  # normalize confidence (we don't want to be 100% sure)
  SURE_YES
end
#get_order(aStr) ⇒ Object
93 94 95 96 97 98 |
# File 'lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb', line 93

# We do not handle characters based on the original encoding string, but
# convert this encoding string to a number, here called order.
# This allows multiple encodings of a language to share one frequency table.
#
# This implementation ignores aStr and always answers -1, which #feed treats
# as "not a valid order".
# NOTE(review): presumably the language-specific subclasses override this
# with a real mapping - confirm against those classes.
def get_order(aStr)
  -1
end
#got_enough_data ⇒ Object
87 88 89 90 91 |
# File 'lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb', line 87

# It is not necessary to receive all data to draw conclusion. For charset
# detection, certain amount of data is enough.
#
# Returns true once the number of counted characters is strictly greater
# than ENOUGH_DATA_THRESHOLD, false otherwise.
def got_enough_data
  @_mTotalChars > ENOUGH_DATA_THRESHOLD
end
#reset ⇒ Object
43 44 45 46 47 48 |
# File 'lib/tmail/vendor/rchardet-1.3/lib/rchardet/chardistribution.rb', line 43

# Reset analyser, clear any state: the done flag goes back to false and both
# character counters go back to zero.
def reset
  # If this flag is set to true, detection is done and conclusion has been made
  @_mDone = false
  # Total characters encountered
  @_mTotalChars = 0
  # The number of characters whose frequency order is less than 512
  @_mFreqChars = 0
end