Class: UniversalDetector::SingleByteCharSetProber
Instance Method Summary
collapse
#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_state
Constructor Details
#initialize(model, reversed = false, nameProber = nil) ⇒ SingleByteCharSetProber
Returns a new instance of SingleByteCharSetProber.
43
44
45
46
47
48
49
|
# File 'lib/SingleByteCharSetProber.rb', line 43
# Builds a prober around a single-byte language model.
#
# model      - model object indexed by string keys; the methods of this class
#              read 'keepEnglishLetter', 'charToOrderMap', 'precedenceMatrix',
#              'mTypicalPositiveRatio' and 'charsetName' from it.
# reversed   - when true, feed looks up each character pair in the precedence
#              matrix with the two orders swapped (default: false).
# nameProber - optional object whose get_charset_name is reported instead of
#              the model's own 'charsetName' (default: nil).
def initialize(model, reversed = false, nameProber = nil)
  super()
  @_mModel = model
  @_mReversed = reversed
  @_mNameProber = nameProber
  # Start from clean counters; reset also runs the parent's reset.
  reset
end
|
Instance Method Details
#feed(aBuf) ⇒ Object
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
# File 'lib/SingleByteCharSetProber.rb', line 68
# Feeds one chunk of input to the prober and updates its statistics.
#
# Unless the model sets 'keepEnglishLetter', the chunk is first passed through
# filter_without_english_letters. Each element of the chunk is mapped to an
# "order" through the model's 'charToOrderMap':
#   * orders below SYMBOL_CAT_ORDER are counted in @_mTotalChar;
#   * orders below SAMPLE_SIZE are counted in @_mFreqChar, and when the
#     previous order was also below SAMPLE_SIZE the pair is counted in
#     @_mTotalSeqs and in the @_mSeqCounters slot selected by the model's
#     'precedenceMatrix'.
# Once more than SB_ENOUGH_REL_THRESHOLD pairs have been seen while the state
# is :Detecting, the confidence decides whether to switch to :FoundIt or
# :NotMe.
#
# aBuf - the chunk to analyse; must respond to #length and integer #[].
#        NOTE(review): on Ruby >= 1.9 String#[] with an Integer returns a
#        one-character String rather than a byte value - confirm that the
#        elements of aBuf are usable as 'charToOrderMap' indexes.
#
# Returns the result of get_state after processing the chunk.
def feed(aBuf)
  aBuf = filter_without_english_letters(aBuf) unless @_mModel['keepEnglishLetter']

  aLen = aBuf.length
  # 0 is truthy in Ruby, so the length has to be compared explicitly; a plain
  # `unless aLen` never fires and lets an empty chunk reach the shortcut
  # checks below, where it could change the state without any new data.
  return get_state if aLen.zero?

  char_to_order = @_mModel['charToOrderMap']
  precedence = @_mModel['precedenceMatrix']

  aLen.times do |i|
    order = char_to_order[aBuf[i]]
    @_mTotalChar += 1 if order < SYMBOL_CAT_ORDER

    if order < SAMPLE_SIZE
      @_mFreqChar += 1
      if @_mLastOrder < SAMPLE_SIZE
        @_mTotalSeqs += 1
        # The matrix is stored flat, SAMPLE_SIZE entries per row; a reversed
        # prober swaps which order selects the row and which the column.
        index = if @_mReversed
                  (order * SAMPLE_SIZE) + @_mLastOrder
                else
                  (@_mLastOrder * SAMPLE_SIZE) + order
                end
        @_mSeqCounters[precedence[index]] += 1
      end
    end

    @_mLastOrder = order
  end

  if get_state == :Detecting && @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
    cf = get_confidence
    # NOTE(review): the debug messages are single-quoted, so the trailing \n
    # is emitted literally, and `p` prints the inspected string.
    if cf > POSITIVE_SHORTCUT_THRESHOLD
      p('%s confidence = %s, we have a winner\n' % [@_mModel['charsetName'], cf]) if DEBUG
      @_mState = :FoundIt
    elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
      if DEBUG
        p('%s confidence = %s, below negative shortcut threshhold %s\n' % [@_mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD])
      end
      @_mState = :NotMe
    end
  end

  get_state
end
|
#get_charset_name ⇒ Object
60
61
62
63
64
65
66
|
# File 'lib/SingleByteCharSetProber.rb', line 60
# Reports the charset name for this prober: the name prober's answer when one
# was supplied, otherwise the model's own 'charsetName' entry.
def get_charset_name
  delegate = @_mNameProber
  delegate ? delegate.get_charset_name : @_mModel['charsetName']
end
|
#get_confidence ⇒ Object
117
118
119
120
121
122
123
124
125
126
127
128
129
|
# File 'lib/SingleByteCharSetProber.rb', line 117
# Computes the confidence from the statistics gathered so far.
#
# With no sequences counted the result is 0.01. Otherwise the share of
# sequences in the POSITIVE_CAT counter is divided by the model's
# 'mTypicalPositiveRatio' and scaled by @_mFreqChar / @_mTotalChar; any
# result of 1.0 or more is reported as 0.99.
def get_confidence
  return 0.01 unless @_mTotalSeqs > 0

  positive_share = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs
  confidence = positive_share / @_mModel['mTypicalPositiveRatio']
  confidence = confidence * @_mFreqChar / @_mTotalChar
  confidence >= 1.0 ? 0.99 : confidence
end
|
#reset ⇒ Object
51
52
53
54
55
56
57
58
|
# File 'lib/SingleByteCharSetProber.rb', line 51
# Returns the prober to its initial state: runs the parent's reset and then
# clears every statistic that feed accumulates.
def reset
  super
  # feed only counts a pair when the previous order is below SAMPLE_SIZE;
  # presumably 255 lies outside that range so the first character never
  # completes a pair - confirm against SAMPLE_SIZE.
  @_mLastOrder = 255
  # One counter per sequence category, all starting at zero.
  @_mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
  @_mTotalSeqs = 0
  @_mTotalChar = 0
  @_mFreqChar = 0
end
|