Class: CharDet::SingleByteCharSetProber
Overview
Instance Attribute Summary
#active
Instance Method Summary
collapse
#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_state
Constructor Details
#initialize(model, reversed = false, nameProber = nil) ⇒ SingleByteCharSetProber
Returns a new instance of SingleByteCharSetProber.
41
42
43
44
45
46
47
|
# File 'lib/rchardet/sbcharsetprober.rb', line 41
# Builds a prober around the given language model.
#
# @param model the model object; other methods of this class read it with
#   string keys such as 'charToOrderMap', 'precedenceMatrix' and 'charsetName'
# @param reversed [Boolean] stored in @reversed; #feed swaps the two operands
#   of the precedence-matrix index when it is truthy
# @param nameProber optional object; when set, #get_charset_name delegates to
#   its get_charset_name
def initialize(model, reversed=false, nameProber=nil)
  super()
  @model = model
  @reversed = reversed
  @nameProber = nameProber
  # Start from clean counters; must run after the ivars above are assigned.
  reset()
end
|
Instance Method Details
#feed(aBuf) ⇒ Object
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
# File 'lib/rchardet/sbcharsetprober.rb', line 66
# Consumes a buffer of bytes, updates the character and sequence counters,
# and possibly moves the prober to a final state.
#
# For every byte the model's 'charToOrderMap' yields an "order" value:
# * orders below SYMBOL_CAT_ORDER increment @totalChar;
# * orders below SAMPLE_SIZE increment @freqChar and, when the previous
#   byte's order was also below SAMPLE_SIZE, count one sequence in
#   @seqCounters under the category given by the model's 'precedenceMatrix'.
#
# Once more than SB_ENOUGH_REL_THRESHOLD sequences have been seen while the
# state is EDetecting, the confidence is compared against the positive and
# negative shortcut thresholds to decide EFoundIt / ENotMe.
#
# @param aBuf an object responding to #length and #each_byte (presumably a
#   String -- confirm against callers)
# @return whatever get_state() reports after processing
def feed(aBuf)
  aBuf = filter_without_english_letters(aBuf) unless @model['keepEnglishLetter']
  return get_state() if aBuf.length == 0

  # Loop-invariant model tables, looked up once instead of once per byte.
  char_to_order = @model['charToOrderMap']
  precedence = @model['precedenceMatrix']

  aBuf.each_byte do |byte|
    # each_byte already yields the integer byte value, so it indexes the
    # order map directly (no String round-trip needed).
    order = char_to_order[byte]
    @totalChar += 1 if order < SYMBOL_CAT_ORDER

    if order < SAMPLE_SIZE
      @freqChar += 1
      if @lastOrder < SAMPLE_SIZE
        @totalSeqs += 1
        # In reversed mode the roles of the previous and current order are
        # swapped when indexing the flattened precedence matrix.
        index = if @reversed
                  (order * SAMPLE_SIZE) + @lastOrder
                else
                  (@lastOrder * SAMPLE_SIZE) + order
                end
        @seqCounters[precedence[index]] += 1
      end
    end

    @lastOrder = order
  end

  if get_state() == EDetecting && @totalSeqs > SB_ENOUGH_REL_THRESHOLD
    cf = get_confidence()
    if cf > POSITIVE_SHORTCUT_THRESHOLD
      $stderr << "#{@model['charsetName']} confidence = #{cf}, we have a winner\n" if $debug
      @state = EFoundIt
    elsif cf < NEGATIVE_SHORTCUT_THRESHOLD
      $stderr << "#{@model['charsetName']} confidence = #{cf}, below negative shortcut threshold #{NEGATIVE_SHORTCUT_THRESHOLD}\n" if $debug
      @state = ENotMe
    end
  end

  get_state()
end
|
#get_charset_name ⇒ Object
58
59
60
61
62
63
64
|
# File 'lib/rchardet/sbcharsetprober.rb', line 58
# Reports the charset name for this prober.
#
# When a name prober was supplied, its get_charset_name result is returned;
# otherwise the name comes from the model's 'charsetName' entry.
def get_charset_name
  return @model['charsetName'] unless @nameProber

  @nameProber.get_charset_name
end
|
#get_confidence ⇒ Object
110
111
112
113
114
115
116
117
118
119
120
|
# File 'lib/rchardet/sbcharsetprober.rb', line 110
# Computes the confidence from the counters gathered so far.
#
# Returns 0.01 while no sequences have been counted. Otherwise the share of
# POSITIVE_CAT sequences is divided by the model's 'mTypicalPositiveRatio',
# scaled by @freqChar / @totalChar, and any result of 1.0 or more is
# reported as 0.99.
def get_confidence
  return 0.01 unless @totalSeqs > 0

  positive_share = (1.0 * @seqCounters[POSITIVE_CAT]) / @totalSeqs
  confidence = positive_share / @model['mTypicalPositiveRatio']
  confidence = confidence * @freqChar / @totalChar
  confidence >= 1.0 ? 0.99 : confidence
end
|
#reset ⇒ Object
49
50
51
52
53
54
55
56
|
# File 'lib/rchardet/sbcharsetprober.rb', line 49
# Returns the prober to its initial counting state.
#
# Calls the superclass reset first, then clears every counter that #feed
# accumulates.
def reset
  super()
  # Order of the previously seen byte. #feed only counts a sequence when this
  # is below SAMPLE_SIZE; 255 is presumably outside that range so the first
  # byte never forms a sequence -- confirm against SAMPLE_SIZE.
  @lastOrder = 255
  # One counter per sequence category, all starting at zero.
  @seqCounters = [0] * NUMBER_OF_SEQ_CAT
  @totalSeqs = 0
  @totalChar = 0
  @freqChar = 0
end
|