Class: UniversalDetector::CharDistributionAnalysis

Inherits:
Object
  • Object
show all
Defined in:
lib/CharDistributionAnalysis.rb

Constant Summary collapse

ENOUGH_DATA_THRESHOLD =
1024
SURE_YES =
0.99
SURE_NO =
0.01

Instance Method Summary collapse

Constructor Details

#initialize ⇒ CharDistributionAnalysis

Returns a new instance of CharDistributionAnalysis.



43
44
45
46
47
48
# File 'lib/CharDistributionAnalysis.rb', line 43

# Creates an analyser with no frequency model attached and zeroed counters.
#
# The three model fields start out as nil:
# * @_mCharToFreqOrder          - table mapping a character order (see get_order)
#                                 to its frequency order
# * @_mTableSize                - number of entries in that table
# * @_mTypicalDistributionRatio - per-language constant used when computing
#                                 confidence; see
#                                 http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
# NOTE(review): presumably language-specific subclasses fill these in - confirm.
def initialize
    @_mCharToFreqOrder = @_mTableSize = @_mTypicalDistributionRatio = nil
    # Counters and the done flag are initialised by reset.
    reset
end

Instance Method Details

#feed(aStr, aCharLen) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/CharDistributionAnalysis.rb', line 57

# Feeds one character of known byte length into the distribution counters.
#
# aStr     - the character data, passed through to get_order
# aCharLen - byte length of the character; only 2-byte characters are analysed
#
# Every character with a non-negative order bumps @_mTotalChars; it also bumps
# @_mFreqChars when its order lies inside the table and maps to a frequency
# order below 512.
def feed(aStr, aCharLen)
    # Anything that is not a 2-byte character is treated as "no order".
    rank = aCharLen == 2 ? get_order(aStr) : -1
    return unless rank >= 0

    @_mTotalChars += 1
    # Orders beyond the table count towards the total only.
    return unless rank < @_mTableSize

    @_mFreqChars += 1 if 512 > @_mCharToFreqOrder[rank]
end

#get_confidence ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/CharDistributionAnalysis.rb', line 76

# Returns a confidence value based on the characters fed so far.
#
# * SURE_NO when no character in the considered range has been seen.
# * Otherwise the ratio frequent / (rare * @_mTypicalDistributionRatio),
#   where rare = @_mTotalChars - @_mFreqChars, as long as it is below SURE_YES.
# * SURE_YES when that ratio reaches SURE_YES or when every counted character
#   was a frequent one (we never want to claim 100% certainty).
#
# @return [Float] a value no greater than SURE_YES
def get_confidence
    # If we didn't receive any character in our consideration range,
    # return the negative answer.
    return SURE_NO if @_mTotalChars <= 0

    rare_chars = @_mTotalChars - @_mFreqChars
    unless rare_chars.zero?
        # fdiv forces floating-point division; a plain "/" would truncate to an
        # Integer if the distribution ratio happened to be an Integer.
        ratio = @_mFreqChars.fdiv(rare_chars * @_mTypicalDistributionRatio)
        return ratio if ratio < SURE_YES
    end

    # Normalize confidence (we don't want to be 100% sure).
    SURE_YES
end

#get_order(aStr) ⇒ Object



100
101
102
103
104
105
# File 'lib/CharDistributionAnalysis.rb', line 100

# Maps an encoded character to a number called its "order".
#
# Characters are not analysed by their raw encoded bytes; converting them to an
# order lets several encodings of one language share a single frequency table.
# This implementation knows no encoding and always answers -1, which feed
# treats as "not counted".
# NOTE(review): presumably overridden per encoding elsewhere - confirm.
def get_order(aStr)
    -1
end

#got_enough_data ⇒ Object



94
95
96
97
98
# File 'lib/CharDistributionAnalysis.rb', line 94

# Tells whether enough characters have been counted to draw a conclusion.
#
# Charset detection does not need the whole input; once the number of counted
# characters exceeds ENOUGH_DATA_THRESHOLD the sample is considered sufficient.
#
# Returns true when @_mTotalChars is strictly greater than the threshold.
def got_enough_data
    @_mTotalChars > ENOUGH_DATA_THRESHOLD
end

#reset ⇒ Object



50
51
52
53
54
55
# File 'lib/CharDistributionAnalysis.rb', line 50

# Resets the analyser, discarding any accumulated state.
#
# * @_mDone       - set back to false; when true, detection is finished and a
#                   conclusion has been reached
# * @_mTotalChars - total number of characters counted, back to 0
# * @_mFreqChars  - number of counted characters whose frequency order is
#                   below 512, back to 0
def reset
    @_mDone = false
    @_mTotalChars = @_mFreqChars = 0
end