Class: UniversalDetector::SingleByteCharSetProber

Inherits:
CharSetProber
  • Object
show all
Defined in:
lib/SingleByteCharSetProber.rb

Instance Method Summary collapse

Methods inherited from CharSetProber

#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_state

Constructor Details

#initialize(model, reversed = false, nameProber = nil) ⇒ SingleByteCharSetProber

Returns a new instance of SingleByteCharSetProber.



43
44
45
46
47
48
49
# File 'lib/SingleByteCharSetProber.rb', line 43

# Builds a prober around a single-byte language model.
#
# model      - the language model; other methods of this class read it with
#              string keys such as 'charsetName' and 'charToOrderMap'.
# reversed   - true if every character pair must be reversed in the model
#              lookup.
# nameProber - optional auxiliary prober consulted for the charset name.
def initialize(model, reversed=false, nameProber=nil)
    super()
    @_mModel, @_mReversed, @_mNameProber = model, reversed, nameProber
    # Start from clean statistics.
    reset
end

Instance Method Details

#feed(aBuf) ⇒ Object



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/SingleByteCharSetProber.rb', line 68

# Feeds a chunk of input to the prober, updating the character and
# two-character sequence statistics, and returns the resulting state.
#
# aBuf - the data to analyse: either a String (walked byte by byte) or a
#        container of integer byte values that responds to #each.
#
# Unless the model sets 'keepEnglishLetter', the buffer is first passed
# through filter_without_english_letters. Once more than
# SB_ENOUGH_REL_THRESHOLD sequences have been seen while the state is
# :Detecting, the confidence is compared against the positive and negative
# shortcut thresholds and the state may switch to :FoundIt or :NotMe.
#
# Returns the value of get_state after processing.
def feed(aBuf)
    aBuf = filter_without_english_letters(aBuf) unless @_mModel['keepEnglishLetter']

    # An empty buffer carries no new evidence. The length has to be compared
    # explicitly: 0 is truthy in Ruby, so a bare `unless length` never fires.
    return get_state() if aBuf.length.zero?

    char_to_order = @_mModel['charToOrderMap']
    precedence = @_mModel['precedenceMatrix']

    # Indexing a String yields one-character Strings on Ruby >= 1.9, which
    # cannot index the order map; iterate over raw byte values instead.
    byte_values = aBuf.respond_to?(:each_byte) ? aBuf.each_byte : aBuf.each

    byte_values.each do |c|
        order = char_to_order[c]
        @_mTotalChar += 1 if order < SYMBOL_CAT_ORDER
        if order < SAMPLE_SIZE
            @_mFreqChar += 1
            if @_mLastOrder < SAMPLE_SIZE
                @_mTotalSeqs += 1
                pair_index = if @_mReversed
                    # reverse the order of the letters in the lookup
                    (order * SAMPLE_SIZE) + @_mLastOrder
                else
                    (@_mLastOrder * SAMPLE_SIZE) + order
                end
                @_mSeqCounters[precedence[pair_index]] += 1
            end
        end
        @_mLastOrder = order
    end

    if get_state() == :Detecting && @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
        cf = get_confidence()
        if cf > POSITIVE_SHORTCUT_THRESHOLD
            if DEBUG
                p('%s confidence = %s, we have a winner\n' % [@_mModel['charsetName'], cf])
            end
            @_mState = :FoundIt
        elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
            if DEBUG
                p('%s confidence = %s, below negative shortcut threshhold %s\n' % [@_mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD])
            end
            @_mState = :NotMe
        end
    end

    get_state()
end

#get_charset_name ⇒ Object



60
61
62
63
64
65
66
# File 'lib/SingleByteCharSetProber.rb', line 60

# Returns the charset name reported by this prober.
#
# When an auxiliary name prober was supplied, its answer is used; otherwise
# the name comes from the model's 'charsetName' entry.
def get_charset_name
    return @_mModel['charsetName'] unless @_mNameProber

    @_mNameProber.get_charset_name
end

#get_confidence ⇒ Object



117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/SingleByteCharSetProber.rb', line 117

# Computes the confidence that the input matches this prober's model.
#
# Returns 0.01 while no sequences have been counted. Otherwise the share of
# POSITIVE_CAT sequences is divided by the model's 'mTypicalPositiveRatio'
# and scaled by the fraction of counted characters that fell inside the
# sampling range. Results of 1.0 or more are reported as 0.99.
def get_confidence
    return 0.01 unless @_mTotalSeqs > 0

    positive_share = @_mSeqCounters[POSITIVE_CAT].to_f / @_mTotalSeqs
    normalized = positive_share / @_mModel['mTypicalPositiveRatio']
    scaled = normalized * @_mFreqChar / @_mTotalChar
    scaled >= 1.0 ? 0.99 : scaled
end

#reset ⇒ Object



51
52
53
54
55
56
57
58
# File 'lib/SingleByteCharSetProber.rb', line 51

# Restores the prober to its initial state: runs the parent class's reset,
# then clears every statistic gathered by feed.
def reset
    super
    # One counter per sequence category.
    @_mSeqCounters = Array.new(NUMBER_OF_SEQ_CAT, 0)
    # Char order of the last character; 255 presumably lies outside the
    # sampling range so the first character never forms a pair (confirm
    # against SAMPLE_SIZE).
    @_mLastOrder = 255
    # Sequence count, counted characters, and characters that fall in our
    # sampling range all restart at zero.
    @_mTotalSeqs = @_mTotalChar = @_mFreqChar = 0
end