Class: CharDet::CharDistributionAnalysis

Inherits:
Object
  • Object
show all
Defined in:
lib/rchardet/chardistribution.rb

Instance Method Summary collapse

Constructor Details

#initialize ⇒ CharDistributionAnalysis

Returns a new instance of CharDistributionAnalysis.



35
36
37
38
39
40
# File 'lib/rchardet/chardistribution.rb', line 35

# Sets up an analyser with no language-specific tables configured, then
# clears the running counters through #reset.
def initialize
  # Lookup table that maps a character order (as produced by get_order)
  # to its frequency order.
  @_mCharToFreqOrder = nil
  # Number of entries in the table above.
  @_mTableSize = nil
  # Per-language constant used when calculating confidence. See
  # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
  # for further detail.
  @_mTypicalDistributionRatio = nil
  reset
end

Instance Method Details

#feed(aStr, aCharLen) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/rchardet/chardistribution.rb', line 49

# Feeds one character of known byte length into the distribution counters.
#
# aStr     - the character data; passed to get_order unchanged.
# aCharLen - byte length of the character. Only 2-byte characters take part
#            in the analysis; any other length is ignored.
#
# Bumps @_mTotalChars for every character with a non-negative order, and
# @_mFreqChars when that order lies inside the table and maps to a
# frequency order below 512.
def feed(aStr, aCharLen)
  order = aCharLen == 2 ? get_order(aStr) : -1
  # A negative order means "not a character we track".
  return unless order >= 0

  @_mTotalChars += 1
  @_mFreqChars += 1 if order < @_mTableSize && 512 > @_mCharToFreqOrder[order]
end

#get_confidence ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/rchardet/chardistribution.rb', line 68

# Returns a confidence value computed from the counters gathered so far.
#
# Returns SURE_NO when no character in the analysed range has been seen.
# Otherwise the result is the number of frequent characters divided by the
# number of remaining characters scaled by @_mTypicalDistributionRatio,
# capped at SURE_YES so the answer is never reported as 100% certain.
def get_confidence
  return SURE_NO unless @_mTotalChars > 0

  # When every character was a frequent one the divisor would be zero, so
  # that case goes straight to the cap.
  if @_mFreqChars != @_mTotalChars
    infrequent = @_mTotalChars - @_mFreqChars
    # NOTE(review): relies on @_mTypicalDistributionRatio being a Float;
    # with an Integer ratio this would be integer division - confirm.
    ratio = @_mFreqChars / (infrequent * @_mTypicalDistributionRatio)
    return ratio if ratio < SURE_YES
  end

  SURE_YES
end

#get_order(aStr) ⇒ Object



92
93
94
95
96
97
# File 'lib/rchardet/chardistribution.rb', line 92

# Converts an encoded character into its "order" number.
#
# Characters are not handled by their original encoded bytes; they are
# converted to an order number so that several encodings of one language
# can share a single frequency table.
#
# This implementation ignores aStr and always answers -1, which #feed
# treats as "not counted". Presumably subclasses override it - verify.
def get_order(aStr)
  -1
end

#got_enough_data ⇒ Object



86
87
88
89
90
# File 'lib/rchardet/chardistribution.rb', line 86

# Reports whether enough characters have been counted to draw a conclusion.
#
# Charset detection does not need the whole input; a certain amount of data
# is sufficient. Returns true once @_mTotalChars is strictly greater than
# ENOUGH_DATA_THRESHOLD, false otherwise.
def got_enough_data
  @_mTotalChars > ENOUGH_DATA_THRESHOLD
end

#reset ⇒ Object



42
43
44
45
46
47
# File 'lib/rchardet/chardistribution.rb', line 42

# Resets the analyser, clearing all accumulated state.
def reset
  # Set once detection is finished and a conclusion has been made.
  @_mDone = false
  # @_mTotalChars: total characters encountered.
  # @_mFreqChars:  characters whose frequency order is less than 512.
  @_mTotalChars = @_mFreqChars = 0
end