Class: CharDet::SingleByteCharSetProber
Overview
Instance Attribute Summary
#active
Instance Method Summary
collapse
#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #result, #state
Constructor Details
#initialize(model, reversed = false, nameProber = nil) ⇒ SingleByteCharSetProber
Returns a new instance of SingleByteCharSetProber.
41
42
43
44
45
46
47
|
# File 'lib/rchardet/sbcharsetprober.rb', line 41
# Creates a prober driven by a single-byte language model.
#
# model      - keyed collection read by this class via string keys
#              ('charsetName', 'keepEnglishLetter', 'charToOrderMap',
#              'precedenceMatrix', 'mTypicalPositiveRatio').
# reversed   - when truthy, #feed swaps the two character orders of each pair
#              before looking the pair up in the precedence matrix.
# nameProber - optional object responding to #charset_name; when given,
#              #charset_name reports its answer instead of the model's.
#
# The counters are initialised by calling #reset.
def initialize(model, reversed = false, nameProber = nil)
  super()
  @_mModel = model
  @_mReversed = reversed
  @_mNameProber = nameProber
  reset
end
|
Instance Method Details
#charset_name ⇒ Object
58
59
60
61
62
63
64
|
# File 'lib/rchardet/sbcharsetprober.rb', line 58
# Name of the charset this prober stands for.
#
# Delegates to the auxiliary name prober when one was supplied to the
# constructor; otherwise returns the model's 'charsetName' entry.
def charset_name
  delegate = @_mNameProber
  return delegate.charset_name if delegate

  @_mModel['charsetName']
end
|
#confidence ⇒ Object
110
111
112
113
114
115
116
117
118
119
120
121
122
|
# File 'lib/rchardet/sbcharsetprober.rb', line 110
# Current confidence that the fed data matches this prober's model.
#
# Returns 0.01 while no character pairs have been counted. Otherwise the
# share of pairs counted under POSITIVE_CAT is divided by the model's
# 'mTypicalPositiveRatio' and scaled by @_mFreqChar / @_mTotalChar; a result
# of 1.0 or more is reported as 0.99.
def confidence
  return 0.01 unless @_mTotalSeqs > 0

  positive_share = (1.0 * @_mSeqCounters[POSITIVE_CAT]) / @_mTotalSeqs
  score = positive_share / @_mModel['mTypicalPositiveRatio']
  score = score * @_mFreqChar / @_mTotalChar
  score >= 1.0 ? 0.99 : score
end
|
#feed(aBuf) ⇒ Object
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
# File 'lib/rchardet/sbcharsetprober.rb', line 66
# Feeds a chunk of bytes to the prober and returns the resulting state.
#
# aBuf - a String; it is first passed through filter_without_english_letters
#        unless the model's 'keepEnglishLetter' entry is truthy.
#
# Each byte is mapped to an "order" through the model's 'charToOrderMap'.
# Orders below SYMBOL_CAT_ORDER count towards @_mTotalChar; orders below
# SAMPLE_SIZE count towards @_mFreqChar and, when the previous order is also
# below SAMPLE_SIZE, the pair is classified through 'precedenceMatrix' and
# tallied in @_mSeqCounters.
#
# While the prober is still in EDetecting and more than
# SB_ENOUGH_REL_THRESHOLD pairs have been seen, the confidence is compared
# against the positive/negative shortcut thresholds to settle on EFoundIt or
# ENotMe early.
#
# Returns the value of #state after processing.
def feed(aBuf)
  aBuf = filter_without_english_letters(aBuf) unless @_mModel['keepEnglishLetter']

  # Integer 0 is truthy in Ruby, so a `not aBuf.length` guard can never fire.
  # Test emptiness explicitly so an empty chunk leaves the state untouched.
  return state if aBuf.empty?

  char_to_order = @_mModel['charToOrderMap']
  precedence = @_mModel['precedenceMatrix']

  aBuf.each_byte do |byte|
    order = char_to_order[byte]
    @_mTotalChar += 1 if order < SYMBOL_CAT_ORDER

    if order < SAMPLE_SIZE
      @_mFreqChar += 1
      if @_mLastOrder < SAMPLE_SIZE
        @_mTotalSeqs += 1
        # A reversed model looks the pair up as (current, previous).
        index = if @_mReversed
                  (order * SAMPLE_SIZE) + @_mLastOrder
                else
                  (@_mLastOrder * SAMPLE_SIZE) + order
                end
        @_mSeqCounters[precedence[index]] += 1
      end
    end

    @_mLastOrder = order
  end

  if state == EDetecting && @_mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
    cf = confidence
    if cf > POSITIVE_SHORTCUT_THRESHOLD
      $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
      @_mState = EFoundIt
    elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
      $stderr << "#{@_mModel['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
      @_mState = ENotMe
    end
  end

  state
end
|
#reset ⇒ Object
49
50
51
52
53
54
55
56
|
# File 'lib/rchardet/sbcharsetprober.rb', line 49
# Restores the prober to its initial state: runs the superclass reset and
# clears every statistic accumulated by #feed.
def reset
  super()
  # Order of the previously seen character. NOTE(review): 255 presumably lies
  # outside the sampled range so the first character fed never forms a pair;
  # confirm against SAMPLE_SIZE.
  @_mLastOrder = 255
  # One tally per sequence category, indexed by precedence-matrix values.
  @_mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
  @_mTotalSeqs = 0
  @_mTotalChar = 0
  @_mFreqChar = 0
end
|