Class: UTF8Parser

Inherits:
StringScanner
  • Object
show all
Defined in:
lib/rdf_objects/parsers.rb

Constant Summary collapse

# Pattern handed to #scan by #parse_string; capture group 1 is read back as the string body.
STRING =
/(([\x0-\x1f]|[\\\/bfnrt]|\\u[0-9a-fA-F]{4}|[\x20-\xff])*)/nx
# Sentinel object returned by #parse_string when STRING does not match.
UNPARSED =
Object.new
# Lookup table consulted by #parse_string for each escape; a missing key is filled with k.chr and cached.
UNESCAPE_MAP =
Hash.new { |h, k| h[k] = k.chr }
# Iconv converter that takes UTF-16BE input and produces UTF-8 output.
UTF16toUTF8 =
Iconv.new('utf-8', 'utf-16be')

Instance Method Summary collapse

Constructor Details

#initialize(str) ⇒ UTF8Parser

Returns a new instance of UTF8Parser.



30
31
32
33
# File 'lib/rdf_objects/parsers.rb', line 30

# Sets up the scanner over +str+ and keeps a reference to the same input.
#
# @param str [String] text to scan; forwarded unchanged to the superclass
#   constructor and also stored in @string.
def initialize(str)
  super # bare super forwards +str+ as-is
  @string = str
end

Instance Method Details

#parse_string ⇒ Object



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/rdf_objects/parsers.rb', line 34

# Scans a string body at the current scan position and returns it with its
# backslash escapes decoded.
#
# Escapes are first looked up in UNESCAPE_MAP. When the lookup yields
# nil/false the match is treated as a run of \uXXXX escapes: every escape
# contributes two bytes (high, low) to a UTF-16BE buffer, and the whole run is
# converted with a single UTF16toUTF8.iconv call, so adjacent code units
# (e.g. a surrogate pair) are converted together.
#
# @return [String, Object] the decoded string ('' when the match is empty),
#   or UNPARSED when STRING does not match at the current position.
# @raise [StandardError] when the conversion raises Iconv::Failure; the
#   message carries the original error class and text.
def parse_string
  if scan(STRING)
    return '' if self[1].empty?
    string = self[1].gsub(%r((?:\\[\\bfnrt"/]|(?:\\u(?:[A-Fa-f\d]{4}))+|\\[\x20-\xff]))n) do |c|
      # NOTE(review): the \uXXXX branch below is only reached when
      # UNESCAPE_MAP returns nil/false for the escape character; the map's
      # explicit entries are not visible here - confirm it maps "u" to nil.
      if u = UNESCAPE_MAP[$&[1]]
        u
      else # \uXXXX
        # The buffer must be binary: on Ruby >= 1.9, String#<< with an Integer
        # appends a *character* in the receiver's encoding, so a value >= 0x80
        # pushed onto a UTF-8 string would become a multi-byte sequence and
        # corrupt the UTF-16 byte stream.
        bytes = String.new
        bytes.force_encoding(Encoding::BINARY) if bytes.respond_to?(:force_encoding)
        i = 0
        # Each escape occupies six characters: backslash, "u", four hex digits.
        while c[6 * i] == ?\\ && c[6 * i + 1] == ?u
          bytes << c[6 * i + 2, 2].to_i(16) << c[6 * i + 4, 2].to_i(16)
          i += 1
        end
        UTF16toUTF8.iconv(bytes)
      end
    end
    # Tag the result as UTF-8 where strings carry an encoding (Ruby >= 1.9).
    if string.respond_to?(:force_encoding)
      string.force_encoding(Encoding::UTF_8)
    end
    string
  else
    UNPARSED
  end
rescue Iconv::Failure => e
  raise StandardError, "Caught #{e.class}: #{e}"
end