Class: CharDet::UTF8Prober

Inherits:
CharSetProber show all
Defined in:
lib/rchardet/utf8prober.rb

Instance Attribute Summary

Attributes inherited from CharSetProber

#active

Instance Method Summary collapse

Methods inherited from CharSetProber

#filter_high_bit_only, #filter_with_english_letters, #filter_without_english_letters, #get_state

Constructor Details

#initialize ⇒ UTF8Prober

Returns a new instance of UTF8Prober.



33
34
35
36
37
# File 'lib/rchardet/utf8prober.rb', line 33

# Creates the prober: runs the parent initializer, builds a
# CodingStateMachine from UTF8SMModel, and then calls reset.
def initialize
  super() # explicit empty parens: pass no arguments to the parent
  @codingSM = CodingStateMachine.new(UTF8SMModel)
  reset
end

Instance Method Details

#feed(aBuf) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/rchardet/utf8prober.rb', line 49

# Runs a chunk of data through the coding state machine and updates the
# prober's state.
#
# Every byte of +aBuf+ is converted to a one-character string and passed to
# the state machine. Scanning stops at the first byte whose resulting state
# is EError (prober state becomes ENotMe) or EItsMe (prober state becomes
# EFoundIt). Whenever the machine reports EStart and get_current_charlen is
# 2 or more, the multi-byte character counter is incremented.
#
# After the scan, a prober whose get_state is still EDetecting is switched
# to EFoundIt if get_confidence exceeds SHORTCUT_THRESHOLD.
#
# @param aBuf [#each_byte] the data to examine
# @return [Object] whatever get_state reports after processing
def feed(aBuf)
  aBuf.each_byte do |byte|
    outcome = @codingSM.next_state(byte.chr)

    if outcome == EError
      @state = ENotMe
      break
    end

    if outcome == EItsMe
      @state = EFoundIt
      break
    end

    next unless outcome == EStart

    # Only sequences longer than one byte count as multi-byte characters.
    @numOfMBChar += 1 if @codingSM.get_current_charlen >= 2
  end

  still_undecided = get_state == EDetecting
  @state = EFoundIt if still_undecided && get_confidence > SHORTCUT_THRESHOLD

  get_state
end

#get_charset_name ⇒ Object



45
46
47
# File 'lib/rchardet/utf8prober.rb', line 45

# Name of the character set this prober reports.
#
# @return [String] always "utf-8"
def get_charset_name
  "utf-8"
end

#get_confidence ⇒ Object



75
76
77
78
79
80
81
82
83
# File 'lib/rchardet/utf8prober.rb', line 75

# Confidence derived from the number of multi-byte characters counted so far.
#
# With fewer than six counted characters the result is
# 1.0 - 0.99 * ONE_CHAR_PROB ** count; otherwise a flat 0.99 is returned.
#
# @return [Float]
def get_confidence
  base = 0.99
  return base unless @numOfMBChar < 6

  1.0 - (base * (ONE_CHAR_PROB**@numOfMBChar))
end

#reset ⇒ Object



39
40
41
42
43
# File 'lib/rchardet/utf8prober.rb', line 39

# Puts the prober back into its starting condition: runs the parent's reset,
# resets the coding state machine, and zeroes the multi-byte character count.
def reset
  super() # explicit empty parens: pass no arguments to the parent
  @codingSM.reset
  @numOfMBChar = 0
end