Class: String::UTF8

Inherits:

String

Object
String
String::UTF8

show all

Defined in: lib/string/utf8.rb

Constant Summary collapse

UTF8 =

'utf-8'

Constants inherited from String

Binary, COLORS, Encodings, Escapes, UNICODE_LEADERS_AND_TRAILERS, UNICODE_LT_PAT, UNICODE_L_PAT, UNICODE_T_PAT, UNICODE_WHITESPACE

Instance Attribute Summary collapse

#collation ⇒ Object readonly

Returns the value of attribute collation.

Class Method Summary collapse

.new(string, encoding, collation = nil) ⇒ Object

Instance Method Summary collapse

#<=>(other) ⇒ Object
#==(other) ⇒ Object
#[](arg1, arg2 = nil) ⇒ Object (also: #slice)

See String#[].
#[]=(*args) ⇒ Object
#byte_insert ⇒ Object
#capitalize ⇒ Object
#chop ⇒ Object
#chop! ⇒ Object
#decompose ⇒ Object

Decomposes the string and returns the decomposed string.
#downcase ⇒ Object
#dup ⇒ Object
#each_char(&block) ⇒ Object
#encoding ⇒ Object
#index(item, offset = 0) ⇒ Object
#initialize(string, collation = nil) ⇒ UTF8 constructor

A new instance of UTF8.
#insert(offset, fragment) ⇒ Object

Inserts the given string fragment at the codepoint offset given by offset.
#inspect ⇒ Object
#length ⇒ Object
#lstrip ⇒ Object
#normalize_C ⇒ Object

Normalizes the string to form C and returns the result.
#normalize_D ⇒ Object

Normalizes the string to form D and returns the result.
#normalize_KC ⇒ Object

Normalizes the string to form KC and returns the result.
#reverse ⇒ Object
#rindex(item, offset = -1) ⇒ Object
#rstrip ⇒ Object
#strip ⇒ Object
#swapcase ⇒ Object
#upcase ⇒ Object
#utf8(collation = nil) ⇒ Object

Methods inherited from String

#arguments, #ascii, #binary, #camelcase, #chunks, encodings, #mirc_formatted, #mirc_stripped, #mirc_translated_color, #post_arguments, #strip_user_prefixes, #to_flags, #to_s, #unescaped, #user_prefixes, #valid_channelname?, #valid_nickname?, #valid_user?

Constructor Details

#initialize(string, collation = nil) ⇒ `UTF8`

Returns a new instance of UTF8.

# File 'lib/string/utf8.rb', line 22

# Sets up the string content and remembers the collation.
#
# string    - source text; it is run through Unicode::normalize_KC before
#             being handed to the superclass constructor.
# collation - optional collation value, stored unchanged in @collation.
def initialize(string, collation=nil)
	normalized = Unicode.normalize_KC(string)
	super(normalized)
	@collation = collation
end

Instance Attribute Details

#collation ⇒ `Object` (readonly)

Returns the value of attribute collation.



20
21
22

# File 'lib/string/utf8.rb', line 20

# Reader for @collation, the collation value that was passed to the
# constructor (nil when none was supplied).
def collation
  @collation
end

Class Method Details

.new(string, encoding, collation = nil) ⇒ `Object`

# File 'lib/string/utf8.rb', line 13

# Factory that keeps the generic (string, encoding, collation) signature
# but only accepts the 'utf-8' encoding.
#
# Raises a RuntimeError when +encoding+ differs from the UTF8 constant.
# The instance is allocated by hand and #initialize is invoked explicitly,
# because #initialize does not take the encoding argument.
def self.new(string, encoding, collation=nil)
	unless encoding == UTF8
		raise "Encoding must be 'utf-8' but is '#{encoding}'"
	end
	instance = allocate
	instance.send(:initialize, string, collation)
	instance
end

Instance Method Details

#<=>(other) ⇒ `Object`

# File 'lib/string/utf8.rb', line 56

# Orders two strings using Unicode.strcmp.
#
# Raises a RuntimeError when the two operands carry different collations;
# +other+ must therefore respond to #collation.
def <=>(other)
	unless @collation == other.collation
		raise "Can't compare strings with different collation"
	end
	Unicode.strcmp(self, other)
end

#==(other) ⇒ `Object`



52
53
54

# File 'lib/string/utf8.rb', line 52

# Equality against another string. +other+ is converted with #utf8 first so
# both sides are compared in the same (normalized) form.
#
# Objects that cannot be converted (anything not responding to #utf8, such
# as nil or an Integer) are simply unequal; previously this raised
# NoMethodError, which broke calls like Array#include? on mixed arrays.
def ==(other)
	return false unless other.respond_to?(:utf8)
	super(other.utf8)
end

#[](arg1, arg2 = nil) ⇒ `Object` Also known as: slice

See String#[]. With a single integer index it returns the codepoint at that position, which may be an integer > 255, e.g. "\342\210\205"[0] # => 8709

# File 'lib/string/utf8.rb', line 33

# Codepoint-based element reference (see String#[]).
#
# arg1, arg2 - either (start, length), a Range, or a single Integer index;
#              all positions count codepoints, not bytes.
#
# Returns a substring (converted with #utf8) for the two-argument and Range
# forms, the Integer codepoint for the single-index form, and nil when the
# requested position lies outside the string.
def [](arg1, arg2=nil) #:nodoc:
	codepoints = unpack("U*")
	if arg2 then
		picked = codepoints.slice(arg1, arg2)
		# Array#slice yields nil for an out-of-range start; pass that through
		picked && picked.pack("U*").utf8
	elsif Range === arg1 then
		picked = codepoints.slice(arg1)
		picked && picked.pack("U*").utf8
	else
		# single index: hand back the codepoint itself
		# (the original referenced an undefined local `args` here)
		codepoints.slice(arg1)
	end
end

#[]=(*args) ⇒ `Object`

# File 'lib/string/utf8.rb', line 44

# Codepoint-based element assignment.
#
# args - the index arguments (a single index, start and length, or a Range)
#        followed by the replacement value as last element. The value must
#        respond to #utf8.
#
# The receiver is modified in place and returned.
def []=(*args)
	value = args.pop
	codepoints = unpack("U*")
	codepoints[*args] = value.utf8.unpack("U*")
	# With a single index Array#[]= stores the replacement as a nested array;
	# flatten so pack("U*") sees a flat list of codepoints.
	replace(codepoints.flatten.pack("U*"))
	self
end

#byte_insert ⇒ `Object`

# File 'lib/string/utf8.rb', line 6

alias byte_insert insert

#capitalize ⇒ `Object`

# File 'lib/string/utf8.rb', line 64

# Passes the string through Unicode::capitalize and converts the result
# with #utf8.
def capitalize
	capitalized = Unicode.capitalize(self)
	capitalized.utf8
end

#chop ⇒ `Object`



117
118
119

# File 'lib/string/utf8.rb', line 117

# Returns a copy without its final character. A trailing "\n" or "\r\n"
# is removed as one unit. An empty string is returned unchanged.
def chop
	# the pattern is anchored at \z, so it can match at most once
	sub(/(?:.|\r?\n)\z/u, '')
end

#chop! ⇒ `Object`



113
114
115

# File 'lib/string/utf8.rb', line 113

# In-place variant of #chop: removes the final character (or a trailing
# "\n" / "\r\n") from the receiver.
#
# Returns the receiver, or nil when nothing was removed (empty string).
def chop!
	# the pattern is anchored at \z, so it can match at most once
	sub!(/(?:.|\r?\n)\z/u, '')
end

#decompose ⇒ `Object`

Decomposes the string and returns the decomposed string



135
136
137

# File 'lib/string/utf8.rb', line 135

# Returns the result of Unicode::decompose for this string. The value is
# handed back as-is (it is not converted with #utf8).
def decompose #:nodoc:
	Unicode.decompose(self)
end

#downcase ⇒ `Object`

# File 'lib/string/utf8.rb', line 63

# Passes the string through Unicode::downcase and converts the result
# with #utf8.
def downcase
	lowered = Unicode.downcase(self)
	lowered.utf8
end

#dup ⇒ `Object`



130
131
132

# File 'lib/string/utf8.rb', line 130

# Creates a fresh String::UTF8 with the same content and the same
# collation as the receiver.
def dup
	utf8_class = String::UTF8
	utf8_class.new(self, utf8_class::UTF8, @collation)
end

#each_char(&block) ⇒ `Object`

# File 'lib/string/utf8.rb', line 121

# Yields every character of the string to the block, newlines included
# (the pattern uses the multiline flag).
#
# Returns the receiver.
def each_char(&block)
	any_char = /./um
	scan(any_char, &block)
	self
end

#encoding ⇒ `Object`



27
28
29

# File 'lib/string/utf8.rb', line 27

# Returns the encoding name of this string class, i.e. the value of the
# String::UTF8::UTF8 constant ('utf-8'). The string's bytes are not inspected.
def encoding
	String::UTF8::UTF8
end

#index(item, offset = 0) ⇒ `Object`

# File 'lib/string/utf8.rb', line 77

# Codepoint-based counterpart of String#index.
#
# item   - a Regexp, an Integer codepoint, or a string to look for.
# offset - codepoint position to start searching at; a negative value
#          counts from the end of the string.
#
# Returns the codepoint index of the first match at or after +offset+, or
# nil when there is no match or +offset+ lies outside the string.
# Raises a RuntimeError when +item+ is a String with a different encoding.
def index(item, offset=0)
	raise "Must be of same encoding" if String === item and encoding != item.encoding
	codepoints = unpack("U*")
	# normalize a from-the-end offset so the "+offset" corrections below hold
	offset += codepoints.size if offset < 0
	return nil if offset < 0 || offset > codepoints.size
	case item
		when Regexp
			mb = codepoints[offset..-1].pack("U*")
			bi = mb.byte_index(item)
			# translate the byte position back into a codepoint position
			bi && mb.byte_slice(0,bi).unpack("U*").size+offset
		when Integer
			# sucks, but Array#index does not accept an offset-arg
			found = codepoints[offset..-1].index(item)
			# Array#index returns nil when absent; don't add the offset to nil
			found && found+offset
		else
			if offset.zero? then
				bi = byte_index(item)
				bi && byte_slice(0,bi).unpack("U*").size
			else
				index(Regexp.new(Regexp.escape(item)), offset)
			end
	end
end

#insert(offset, fragment) ⇒ `Object`

Inserts the given string fragment at the codepoint offset given by offset.



109
110
111

# File 'lib/string/utf8.rb', line 109

# Inserts +fragment+ into the receiver at codepoint position +offset+
# (in place) and returns the receiver.
def insert(offset, fragment)  #:nodoc:
	codepoints = unpack("U*")
	# the fragment goes in as one nested array and is flattened afterwards
	codepoints.insert(offset, fragment.unpack("U*"))
	replace(codepoints.flatten.pack("U*"))
end

#inspect ⇒ `Object`



154
155
156

# File 'lib/string/utf8.rb', line 154

# Human-readable form: "<encoding>(<collation or 'none'>):<inherited inspect>".
def inspect
	collation_label = collation || 'none'
	"#{encoding}(#{collation_label}):#{super}"
end

#length ⇒ `Object`

# File 'lib/string/utf8.rb', line 61

# Returns the number of Unicode codepoints (not bytes) in the string.
#
# The count is computed on every call. Caching it in @length would return a
# stale value after the string is modified in place (e.g. by #insert, #[]=
# or #chop!), since none of those reset the cache.
def length
	unpack("U*").size
end

#lstrip ⇒ `Object`

# File 'lib/string/utf8.rb', line 74

# Deletes every match of UNICODE_L_PAT (presumably leading Unicode
# whitespace - the pattern is defined on String) and converts the result
# with #utf8.
def lstrip
	stripped = gsub(UNICODE_L_PAT, '')
	stripped.utf8
end

#normalize_C ⇒ `Object`

Normalizes the string to form C and returns the result



140
141
142

# File 'lib/string/utf8.rb', line 140

# Returns the result of Unicode::normalize_C (normalization form C) for
# this string. The value is handed back as-is.
def normalize_C #:nodoc:
	Unicode.normalize_C(self)
end

#normalize_D ⇒ `Object`

Normalizes the string to form D and returns the result



145
146
147

# File 'lib/string/utf8.rb', line 145

# Returns the result of Unicode::normalize_D (normalization form D) for
# this string. The value is handed back as-is.
def normalize_D #:nodoc:
	Unicode.normalize_D(self)
end

#normalize_KC ⇒ `Object`

Normalizes the string to form KC and returns the result



150
151
152

# File 'lib/string/utf8.rb', line 150

# Returns the result of Unicode::normalize_KC (normalization form KC) for
# this string. The value is handed back as-is.
def normalize_KC #:nodoc:
	Unicode.normalize_KC(self)
end

#reverse ⇒ `Object`

# File 'lib/string/utf8.rb', line 72

# Returns a new string with the codepoints in reverse order, converted
# with #utf8.
#
# The "U*" directive is required on both sides: a bare "U" unpacks/packs
# only the first codepoint, which truncated the result to one character.
def reverse
	unpack("U*").reverse.pack("U*").utf8
end

#rindex(item, offset = -1) ⇒ `Object`

# File 'lib/string/utf8.rb', line 97

# Codepoint-based counterpart of String#rindex.
#
# item   - an Integer codepoint, or anything byte_rindex accepts.
# offset - last position to consider (default -1, the end of the string).
#          NOTE(review): for non-Integer items the offset is handed to
#          byte_rindex unchanged - confirm whether that helper expects a
#          byte or a codepoint position.
#
# Returns the codepoint index of the last match, or nil/false as reported
# by the underlying lookup when nothing is found.
# Raises a RuntimeError when +item+ is a String with a different encoding.
def rindex(item, offset=-1)
	if Integer === item
		return unpack("U*")[0..offset].rindex(item)
	end
	raise "Must be of same encoding" if String === item and encoding != item.encoding
	byte_pos = byte_rindex(item, offset)
	return byte_pos unless byte_pos
	# translate the byte position back into a codepoint position
	byte_slice(0,byte_pos).unpack("U*").size
end

#rstrip ⇒ `Object`

# File 'lib/string/utf8.rb', line 75

# Deletes every match of UNICODE_T_PAT (presumably trailing Unicode
# whitespace - the pattern is defined on String) and converts the result
# with #utf8.
def rstrip
	stripped = gsub(UNICODE_T_PAT, '')
	stripped.utf8
end

#strip ⇒ `Object`

# File 'lib/string/utf8.rb', line 73

# Removes the matches of UNICODE_L_PAT and then of UNICODE_T_PAT - the same
# patterns #lstrip and #rstrip use - and converts the result with #utf8.
#
# The second substitution previously used UNICODE_L_PAT again, so only one
# side of the string was ever stripped.
def strip
	gsub(UNICODE_L_PAT, '').gsub(UNICODE_T_PAT, '').utf8
end

#swapcase ⇒ `Object`

# File 'lib/string/utf8.rb', line 65

# Returns a copy in which every codepoint that equals its upper-case form
# is replaced by the lower-case form, and every other codepoint by its
# upper-case form. The result is converted with #utf8, like the results of
# #upcase, #downcase and #capitalize.
#
# NOTE(review): the three codepoint lists are zipped position by position,
# which assumes Unicode::upcase / Unicode::downcase keep the number of
# codepoints unchanged - confirm for characters with multi-codepoint
# case mappings.
def swapcase
	up   = Unicode::upcase(self)
	down = Unicode::downcase(self)
	unpack("U*").zip(up.unpack("U*"), down.unpack("U*")).map { |n,u,d|
		n == u ? d : u
	}.pack("U*").utf8
end

#upcase ⇒ `Object`

# File 'lib/string/utf8.rb', line 62

# Passes the string through Unicode::upcase and converts the result
# with #utf8.
def upcase
	raised = Unicode.upcase(self)
	raised.utf8
end

#utf8(collation = nil) ⇒ `Object`



126
127
128

# File 'lib/string/utf8.rb', line 126

# Builds a new String::UTF8 from the receiver's content.
#
# collation - optional collation for the new string (default nil); the
#             receiver's own collation is not carried over.
def utf8(collation=nil)
	utf8_class = String::UTF8
	utf8_class.new(self, utf8_class::UTF8, collation)
end

Class: String::UTF8

Constant Summary collapse

Constants inherited from String

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from String

Constructor Details

#initialize(string, collation = nil) ⇒ UTF8

Instance Attribute Details

#collation ⇒ Object (readonly)

Class Method Details

.new(string, encoding, collation = nil) ⇒ Object

Instance Method Details

#<=>(other) ⇒ Object

#==(other) ⇒ Object

#[](arg1, arg2 = nil) ⇒ Object Also known as: slice

#[]=(*args) ⇒ Object

#byte_insert ⇒ Object

#capitalize ⇒ Object

#chop ⇒ Object

#chop! ⇒ Object

#decompose ⇒ Object

#downcase ⇒ Object

#dup ⇒ Object

#each_char(&block) ⇒ Object

#encoding ⇒ Object

#index(item, offset = 0) ⇒ Object

#insert(offset, fragment) ⇒ Object

#inspect ⇒ Object

#length ⇒ Object

#lstrip ⇒ Object

#normalize_C ⇒ Object

#normalize_D ⇒ Object

#normalize_KC ⇒ Object

#reverse ⇒ Object

#rindex(item, offset = -1) ⇒ Object

#rstrip ⇒ Object

#strip ⇒ Object

#swapcase ⇒ Object

#upcase ⇒ Object

#utf8(collation = nil) ⇒ Object