Class: CharDet::UTF8Prober

Inherits:
CharSetProber show all
Defined in:
lib/rchardet/utf8prober.rb

Instance Attribute Summary

Attributes inherited from CharSetProber

#active

Instance Method Summary collapse

Methods inherited from CharSetProber

#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_state

Constructor Details

#initialize ⇒ UTF8Prober

Returns a new instance of UTF8Prober.



33
34
35
36
37
# File 'lib/rchardet/utf8prober.rb', line 33

# Creates the prober: runs the parent constructor, builds a
# CodingStateMachine from UTF8SMModel, and then calls #reset.
def initialize
  super()
  @_mCodingSM = CodingStateMachine.new(UTF8SMModel)
  reset
end

Instance Method Details

#feed(aBuf) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/rchardet/utf8prober.rb', line 49

# Passes every byte of +aBuf+ through the coding state machine and updates
# the prober's state from the results.
#
# Scanning stops at the first byte for which next_state returns EError
# (@_mState becomes ENotMe) or EItsMe (@_mState becomes EFoundIt). Whenever
# next_state returns EStart and the machine's get_current_charlen is at
# least 2, @_mNumOfMBChar is incremented.
#
# After scanning, if get_state is still EDetecting and get_confidence
# exceeds SHORTCUT_THRESHOLD, @_mState is set to EFoundIt.
#
# @param aBuf [#each_byte] the data to examine
# @return the result of get_state once the buffer has been processed
def feed(aBuf)
  aBuf.each_byte do |byte|
    coding_state = @_mCodingSM.next_state(byte.chr)

    if coding_state == EError
      @_mState = ENotMe
      break
    end

    if coding_state == EItsMe
      @_mState = EFoundIt
      break
    end

    next unless coding_state == EStart

    @_mNumOfMBChar += 1 if @_mCodingSM.get_current_charlen >= 2
  end

  # get_confidence is only consulted while the prober is still undecided.
  still_detecting = get_state == EDetecting
  @_mState = EFoundIt if still_detecting && get_confidence > SHORTCUT_THRESHOLD

  get_state
end

#get_charset_name ⇒ Object



45
46
47
# File 'lib/rchardet/utf8prober.rb', line 45

# @return [String] the charset name reported by this prober, always "utf-8"
def get_charset_name
  "utf-8"
end

#get_confidence ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
# File 'lib/rchardet/utf8prober.rb', line 75

# Computes the confidence from the number of multi-byte characters counted
# in @_mNumOfMBChar.
#
# With fewer than 6 counted characters the result is
# 1.0 - 0.99 * ONE_CHAR_PROB**count (evaluated by repeated multiplication);
# otherwise it is the fixed value 0.99.
#
# @return [Float] the confidence value
def get_confidence
  unlike = 0.99
  return unlike unless @_mNumOfMBChar < 6

  @_mNumOfMBChar.times { unlike *= ONE_CHAR_PROB }
  1.0 - unlike
end

#reset ⇒ Object



39
40
41
42
43
# File 'lib/rchardet/utf8prober.rb', line 39

# Runs the parent class's reset, then resets the coding state machine and
# zeroes the multi-byte character counter (@_mNumOfMBChar).
def reset
  super()
  @_mCodingSM.reset
  @_mNumOfMBChar = 0
end