Module: JavaProperties::Encoding::Utf8

Defined in:
lib/java_properties/utf8.rb

Overview

Modules for encoding and decoding UTF-8 characters into ASCII for Java properties files.

# Decode a string with UTF-8 encoded as \uXXXX (e.g. \u0050  = 'P')
decoded_string = JavaProperties::Encoding::Utf8.decode( encoded_string )

# Encode a string so that unicode characters are escaped as \uXXXX
encoded_string = JavaProperties::Encoding::Utf8.encode( decoded_string )

# Get the bytes that represent a given UTF-8 code point
utf8_char_as_string = JavaProperties::Encoding::Utf8.utf8( code_point )

Class Method Summary collapse

Class Method Details

.decode(string) ⇒ Object

Replaces \uXXXX escaped chars with the proper UTF-8 bytes



21
22
23
24
25
26
# File 'lib/java_properties/utf8.rb', line 21

def self.decode(string)
	string.gsub!( /\\[uU]([0-9a-fA-f]{1,6})/) do |c|
	  Encoding::Utf8.utf8($1.hex)
	end
	string
end

.encode(string) ⇒ Object

Encodes all multi-byte UTF-8 characters in the provided string using the \uXXXX format.



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/java_properties/utf8.rb', line 72

# Escapes every multi-byte UTF-8 sequence in +string+ as a \u escape holding
# the hexadecimal code point; single-byte (ASCII) characters are copied as-is.
#
# Escape width depends on the length of the UTF-8 sequence:
# * 2- and 3-byte sequences -> 4 hex digits (\uXXXX)
# * 4-byte sequences        -> 6 hex digits
# * 5- and 6-byte sequences -> 8 and 10 hex digits (legacy UTF-8 forms)
#
# Bytes that cannot start a sequence (128..191, 254, 255) are passed through
# unchanged. The input is read as raw bytes, so it is expected to hold UTF-8.
#
# @param string [String] UTF-8 text to escape (not modified)
# @return [String] a new string with the multi-byte characters escaped
# @raise [ArgumentError] if the input ends in the middle of a sequence
def self.encode(string)
  pending = string.unpack('C*')
  out = []

  until pending.empty?
    lead = pending.shift

    # [number of continuation bytes, lead-byte offset, hex digits to emit]
    tail, offset, digits =
      case lead
      when 192..223 then [1, 192, 4]
      when 224..239 then [2, 224, 4]
      when 240..247 then [3, 240, 6]
      when 248..251 then [4, 248, 8]
      when 252..253 then [5, 252, 10]
      end

    if tail.nil?
      # ASCII, or a byte that is not a valid lead byte: copy it verbatim.
      out << lead
      next
    end

    code_point = lead - offset
    tail.times do
      continuation = pending.shift
      raise ArgumentError, 'truncated UTF-8 sequence' if continuation.nil?

      # Each continuation byte carries six payload bits above the 0x80 marker.
      code_point = code_point * 64 + (continuation - 128)
    end

    out.concat(format("\\u%0#{digits}x", code_point).unpack('C*'))
  end

  out.pack('C*').force_encoding('UTF-8')
end

.utf8(ud) ⇒ Object

Gets the UTF-8 encoding of a given Unicode code point (provided as an Integer)



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/java_properties/utf8.rb', line 30

# Returns the UTF-8 byte sequence for the code point +ud+ as a String.
#
# Besides the standard 1- to 4-byte forms, the legacy 5- and 6-byte forms are
# produced for values up to 0x7FFFFFFF. Such strings (and surrogate code
# points) are not valid UTF-8 even though they carry the UTF-8 encoding tag.
#
# @param ud [Integer] code point, 0..0x7FFFFFFF
# @return [String] the encoded bytes tagged as UTF-8; empty when +ud+ is
#   larger than 0x7FFFFFFF
# @raise [RangeError] if +ud+ is negative
def self.utf8(ud)
  raise RangeError, "#{ud} out of char range" if ud < 0

  # [largest code point for this form, lead-byte marker, continuation bytes]
  layouts = [
    [0x7F,       0x00, 0],
    [0x7FF,      0xC0, 1],
    [0xFFFF,     0xE0, 2],
    [0x1FFFFF,   0xF0, 3],
    [0x3FFFFFF,  0xF8, 4],
    [0x7FFFFFFF, 0xFC, 5]
  ]
  _limit, marker, tail = layouts.find { |limit, _, _| ud <= limit }

  bytes = []
  if marker
    # The lead byte holds the top bits; each continuation byte holds six more.
    bytes << (marker | (ud >> (6 * tail)))
    (tail - 1).downto(0) { |i| bytes << (0x80 | ((ud >> (6 * i)) & 0x3F)) }
  end

  # Pack raw bytes: String#<< with an Integer would append a code point
  # (re-encoding values above 127) rather than a single byte.
  bytes.pack('C*').force_encoding('UTF-8')
end