Class: String

Inherits:
Object
  • Object
show all
Defined in:
lib/toPinyin.rb

Overview

The string should be UTF-8 encoded

Constant Summary collapse

# Byte-level pattern that matches an entire string (\A ... \z) made of zero or
# more well-formed UTF-8 sequences. Among single-byte characters only tab, LF,
# CR and 0x20-0x7E are accepted, so other control bytes make the match fail.
# Compiled with the m, n and x flags (x allows the inline comments below).
UTF8REGEX =
/\A(?:                               # ?: non-capturing group (grouping with no back references)
  [\x09\x0A\x0D\x20-\x7E]            # ASCII
| [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
|  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
|  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
|  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
|  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
)*\z/mnx
# Lookup table read by #pinyin, keyed by the uppercase hex code point string
# that #pinyin builds for each character.
# NOTE(review): uniToPyMap is defined outside this listing; presumably it
# returns a Hash from code point to reading - confirm in lib/toPinyin.rb.
@@mm =
uniToPyMap

Instance Method Summary collapse

Instance Method Details

#pinyin ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/toPinyin.rb', line 45

# Converts the receiver, one character at a time, into an array of readings.
#
# Every character is first turned into its uppercase hexadecimal code point,
# zero-padded to at least four digits. Characters whose hex form begins with
# "00" (this includes a-z and A-Z) are returned unchanged. Any other character
# is looked up in @@mm under that hex key: a found entry is returned with its
# last character removed, a missing entry yields nil.
#
# NOTE(review): @@mm is filled in outside this method; presumably the chopped
# trailing character is a tone digit or line terminator - confirm against the
# map data.
#
# Returns an Array holding one String or nil per character of the receiver.
def pinyin
  scan(/./mu).map do |char|
    # Hex code point of this character, e.g. "4E2D".
    hex = format("%04X", char.unpack("U*")[0])

    # Low code points pass through verbatim.
    next char if hex =~ /^00/

    entry = @@mm[hex]
    entry.nil? ? nil : entry.chop
  end
end

#unicode ⇒ Object



30
31
32
33
34
35
36
37
38
# File 'lib/toPinyin.rb', line 30

# Lists the Unicode code point of every character in the receiver.
#
# Each code point is written as uppercase hexadecimal, zero-padded to at least
# four digits (code points above U+FFFF simply produce more digits).
#
# Returns an Array of Strings, one per character.
def unicode
  code_points = scan(/./mu).map { |char| char.unpack("U*")[0] }
  code_points.map { |code_point| format("%04X", code_point) }
end

#utf8 ⇒ Object



26
27
28
# File 'lib/toPinyin.rb', line 26

# Lists the UTF-8 encoding of every character in the receiver as lowercase hex.
#
# Each byte is rendered as exactly two hex digits, so bytes below 0x10 (tab,
# newline, ...) keep their leading zero: "\n" yields "0a", not "a".
#
# Returns an Array of Strings, one per character.
def utf8
  # The "H*" directive emits two hex digits per byte, high nibble first.
  scan(/./mu).map { |char| char.unpack("H*").first }
end

#utf8? ⇒ Boolean

Returns:

  • (Boolean)


21
22
23
# File 'lib/toPinyin.rb', line 21

# Tells whether the receiver's bytes match UTF8REGEX from start to end.
#
# Returns 0 (truthy) when the whole string matches, nil otherwise.
def utf8?
  # UTF8REGEX is a byte-oriented (/n) pattern. On Rubies with string encodings
  # it cannot be matched against a UTF-8 tagged string that contains non-ASCII
  # characters, so the check runs on a binary-tagged copy of the same bytes.
  # Interpreters without String#force_encoding match the receiver directly.
  bytes = respond_to?(:force_encoding) ? dup.force_encoding("BINARY") : self
  bytes =~ UTF8REGEX
end

#validate_utf8 ⇒ Object



41
42
43
# File 'lib/toPinyin.rb', line 41

# Returns a copy of the receiver with every byte sequence that is not valid
# UTF-8 removed. The receiver's bytes are interpreted as UTF-8 regardless of
# the encoding the string is currently tagged with.
#
# Returns a new String; the receiver is not modified.
def validate_utf8
  if respond_to?(:scrub)
    # Iconv is no longer part of the standard library on current Rubies;
    # String#scrub with an empty replacement drops the invalid bytes instead.
    dup.force_encoding("UTF-8").scrub("")
  else
    # Fallback for interpreters without String#scrub. A space is appended
    # before conversion and stripped again afterwards - presumably so that an
    # invalid sequence at the very end of the input is also discarded.
    Iconv.iconv('UTF-8//IGNORE', 'UTF-8', (self + ' ')).first[0..-2]
  end
end