Class: CharDet::UniversalDetector

Inherits:

Object

Object
CharDet::UniversalDetector

show all

Defined in:: lib/rchardet/universaldetector.rb

Instance Attribute Summary collapse

#done ⇒ Object readonly

Returns the value of attribute done.
#result ⇒ Object readonly

Returns the value of attribute result.

Instance Method Summary collapse

#close ⇒ Object
#feed(aBuf) ⇒ Object
#initialize ⇒ UniversalDetector constructor

A new instance of UniversalDetector.
#reset ⇒ Object

Constructor Details

#initialize ⇒ `UniversalDetector`

Returns a new instance of UniversalDetector.

# File 'lib/rchardet/universaldetector.rb', line 41

def initialize
  @highBitDetector = /[\x80-\xFF]/n
  @escDetector = /(\033|\~\{)/n
  @escCharSetProber = nil
  @charSetProbers = []
  reset()
end

Instance Attribute Details

#done ⇒ `Object` (readonly)

Returns the value of attribute done.



39
40
41

# File 'lib/rchardet/universaldetector.rb', line 39

def done
  @done
end

#result ⇒ `Object` (readonly)

Returns the value of attribute result.



39
40
41

# File 'lib/rchardet/universaldetector.rb', line 39

def result
  @result
end

Instance Method Details

#close ⇒ `Object`

# File 'lib/rchardet/universaldetector.rb', line 146

def close
  return if @done
  if !@gotData
    $stderr << "no data received!\n" if $debug
    return
  end
  @done = true

  if @inputState == EPureAscii
    @result = {'encoding' => 'ascii', 'confidence' => 1.0}
    return @result
  end

  if @inputState == EHighbyte
    confidences = {}
    @charSetProbers.each{ |prober| confidences[prober] = prober.get_confidence }
    maxProber = @charSetProbers.max{ |a,b| confidences[a] <=> confidences[b] }
    if maxProber and maxProber.get_confidence > MINIMUM_THRESHOLD
      @result = {'encoding' =>  maxProber.get_charset_name(),
        'confidence' =>  maxProber.get_confidence()}
      return @result
    end
  end

  if $debug
    $stderr << "no probers hit minimum threshhold\n" if $debug
    for prober in @charSetProbers[0].probers
      next if !prober
      $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n" if $debug
    end
  end
end

#feed(aBuf) ⇒ `Object`

# File 'lib/rchardet/universaldetector.rb', line 64

def feed(aBuf)
  return if @done

  aLen = aBuf.length
  return if aLen == 0

  if !@gotData
    # If the data starts with BOM, we know it is UTF
    if aBuf[0, 3] == "\xEF\xBB\xBF"
      # EF BB BF  UTF-8 with BOM
      @result = {'encoding' => "UTF-8", 'confidence' => 1.0}
    elsif aBuf[0, 4] == "\xFF\xFE\x00\x00"
      # FF FE 00 00  UTF-32, little-endian BOM
      @result = {'encoding' => "UTF-32LE", 'confidence' => 1.0}
    elsif aBuf[0, 4] == "\x00\x00\xFE\xFF"
      # 00 00 FE FF  UTF-32, big-endian BOM
      @result = {'encoding' => "UTF-32BE", 'confidence' => 1.0}
    elsif aBuf[0, 4] == "\xFE\xFF\x00\x00"
      # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
      @result = {'encoding' => "X-ISO-10646-UCS-4-3412", 'confidence' => 1.0}
    elsif aBuf[0, 4] == "\x00\x00\xFF\xFE"
      # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
      @result = {'encoding' =>  "X-ISO-10646-UCS-4-2143", 'confidence' =>  1.0}
    elsif aBuf[0, 2] == "\xFF\xFE"
      # FF FE  UTF-16, little endian BOM
      @result = {'encoding' =>  "UTF-16LE", 'confidence' =>  1.0}
    elsif aBuf[0, 2] == "\xFE\xFF"
      # FE FF  UTF-16, big endian BOM
      @result = {'encoding' =>  "UTF-16BE", 'confidence' =>  1.0}
    elsif aBuf[0, 3] == "\x2B\x2F\x76" && ["\x38", "\x39", "\x2B", "\x2F"].include?(aBuf[3, 1])
      # NOTE: Ruby only includes "dummy" support for UTF-7.
      # A Ruby UTF-7 string can't have methods called on it, nor can it be converted to anything else, but "BINARY"/"ASCII-8BIT".
      # Still, this doesn't make detection useless, as UTF-7 encodings exist in the wild, and the scenario may need to be handled.
      # 2B 2F 76 38  UTF-7
      # 2B 2F 76 39  UTF-7
      # 2B 2F 76 2B  UTF-7
      # 2B 2F 76 2F  UTF-7
      # 2B 2F 76 38 2D  UTF-7 with no following character (empty string)
      @result = {'encoding' =>  "UTF-7", 'confidence' =>  0.99}
    end
  end

  @gotData = true
  if @result['encoding'] and (@result['confidence'] > 0.0)
    @done = true
    return
  end
  if @inputState == EPureAscii
    if @highBitDetector =~ (aBuf)
      @inputState = EHighbyte
    elsif (@inputState == EPureAscii) and @escDetector =~ (@lastChar + aBuf)
      @inputState = EEscAscii
    end
  end

  @lastChar = aBuf[-1, 1]
  if @inputState == EEscAscii
    if !@escCharSetProber
      @escCharSetProber = EscCharSetProber.new()
    end
    if @escCharSetProber.feed(aBuf) == EFoundIt
      @result = {'encoding' =>  @escCharSetProber.get_charset_name(),
        'confidence' =>  @escCharSetProber.get_confidence()
      }
      @done = true
    end
  elsif @inputState == EHighbyte
    if @charSetProbers.nil? || @charSetProbers.empty?
      @charSetProbers = [MBCSGroupProber.new(), SBCSGroupProber.new(), Latin1Prober.new()]
    end
    for prober in @charSetProbers
      if prober.feed(aBuf) == EFoundIt
        @result = {'encoding' =>  prober.get_charset_name(),
          'confidence' =>  prober.get_confidence()}
        @done = true
        break
      end
    end
  end

end

#reset ⇒ `Object`

# File 'lib/rchardet/universaldetector.rb', line 49

def reset
  @result = {'encoding' => nil, 'confidence' => 0.0}
  @done = false
  @start = true
  @gotData = false
  @inputState = EPureAscii
  @lastChar = ''
  if @escCharSetProber
    @escCharSetProber.reset()
  end
  for prober in @charSetProbers
    prober.reset()
  end
end

Class: CharDet::UniversalDetector

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ UniversalDetector

Instance Attribute Details

#done ⇒ Object (readonly)

#result ⇒ Object (readonly)

Instance Method Details

#close ⇒ Object

#feed(aBuf) ⇒ Object

#reset ⇒ Object

#initialize ⇒ `UniversalDetector`

#done ⇒ `Object` (readonly)

#result ⇒ `Object` (readonly)

#close ⇒ `Object`

#feed(aBuf) ⇒ `Object`

#reset ⇒ `Object`