Class: UniversalDetector::MultiByteCharSetProber

Inherits:
CharSetProber show all
Defined in:
lib/MultiByteCharSetProber.rb

Instance Method Summary collapse

Methods inherited from CharSetProber

#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_state

Constructor Details

#initialize ⇒ MultiByteCharSetProber

Returns a new instance of MultiByteCharSetProber.



35
36
37
38
39
40
# File 'lib/MultiByteCharSetProber.rb', line 35

# Creates a prober with no coding state machine and no distribution
# analyzer attached yet: both instance variables start out as nil.
#
# The two-slot "last character" buffer is seeded with real NUL bytes.
# Double quotes are required here: inside a single-quoted Ruby literal,
# '\x00' is the four characters backslash, x, 0, 0 rather than one zero byte.
def initialize
    super
    @_mDistributionAnalyzer = nil
    @_mCodingSM = nil
    @_mLastChar = ["\x00", "\x00"]
end

Instance Method Details

#feed(aBuf) ⇒ Object



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/MultiByteCharSetProber.rb', line 56

# Runs every element of +aBuf+ through the coding state machine and hands
# each completed character to the distribution analyzer.
#
# The scan stops early when the state machine reports :Error (the prober
# state becomes :NotMe) or :ItsMe (the prober state becomes :FoundIt).
# After the scan, a prober that is still :Detecting is promoted to :FoundIt
# once the analyzer reports enough data and the confidence exceeds
# SHORTCUT_THRESHOLD.
#
# @param aBuf [#length, #[]] the chunk of input to examine
# @return [Object] the prober state as reported by get_state()
def feed(aBuf)
    aLen = aBuf.length
    aLen.times do |i|
        codingState = @_mCodingSM.next_state(aBuf[i])
        case codingState
        when :Error
            if UniversalDetector::DEBUG
                p(get_charset_name() + ' prober hit error at byte ' + i.to_s + '\n')
            end
            @_mState = :NotMe
            break
        when :ItsMe
            @_mState = :FoundIt
            break
        when :Start
            charLen = @_mCodingSM.get_current_charlen()
            if i == 0
                # The character ended on the first element of this chunk, so
                # pair it with the element carried over from the previous call.
                @_mLastChar[1] = aBuf[0]
                @_mDistributionAnalyzer.feed(@_mLastChar, charLen)
            else
                # Pass the two-element window ending at i, the same width as
                # the pair used in the i == 0 branch. An inclusive range up
                # to i + 1 would leak the following element into the window.
                # NOTE(review): presumably mirrors a half-open slice
                # [i-1, i+1) in the code this was ported from - confirm.
                @_mDistributionAnalyzer.feed(aBuf[(i - 1)..i], charLen)
            end
        end
    end

    # Remember the final element for the next call. An empty chunk has no
    # final element (aBuf[-1] would be nil), so keep the previous value.
    @_mLastChar[0] = aBuf[aLen - 1] if aLen > 0

    if get_state() == :Detecting
        if @_mDistributionAnalyzer.got_enough_data() && (get_confidence() > SHORTCUT_THRESHOLD)
            @_mState = :FoundIt
        end
    end

    get_state()
end

#get_charset_name ⇒ Object



53
54
# File 'lib/MultiByteCharSetProber.rb', line 53

# Name of the charset this prober recognizes.
#
# This implementation carries no charset of its own and always returns nil.
# NOTE(review): presumably concrete probers override this - confirm.
#
# @return [nil]
def get_charset_name
    nil
end

#get_confidence ⇒ Object



90
91
92
# File 'lib/MultiByteCharSetProber.rb', line 90

# Confidence of this prober, delegated entirely to the distribution analyzer.
#
# @return [Object] whatever the analyzer's get_confidence reports
def get_confidence
    analyzer = @_mDistributionAnalyzer
    analyzer.get_confidence
end

#reset ⇒ Object



42
43
44
45
46
47
48
49
50
51
# File 'lib/MultiByteCharSetProber.rb', line 42

# Returns the prober to its initial condition: runs the inherited reset,
# resets the coding state machine and the distribution analyzer when they
# are set, and re-seeds the two-slot "last character" buffer.
#
# The buffer is seeded with real NUL bytes. Double quotes are required:
# inside a single-quoted Ruby literal, '\x00' is the four characters
# backslash, x, 0, 0 rather than one zero byte.
def reset
    super
    @_mCodingSM.reset() if @_mCodingSM
    @_mDistributionAnalyzer.reset() if @_mDistributionAnalyzer
    @_mLastChar = ["\x00", "\x00"]
end