Class: String
- Inherits:
-
Object
- Object
- String
- Defined in:
- lib/toPinyin.rb
Overview
The string should be UTF-8 encoded
Constant Summary collapse
# Matches a byte string consisting only of the sequences listed below:
# tab, LF, CR and printable ASCII, plus multi-byte UTF-8 sequences that are
# neither overlong nor surrogates.
#
# The pattern uses extended (x) mode, so each alternative needs its own line:
# an x-mode comment runs to the end of the line and would otherwise swallow
# every alternative that follows it.
UTF8REGEX = /\A(?:                       # ?: non-capturing group (grouping with no back references)
    [\x09\x0A\x0D\x20-\x7E]             # ASCII
  | [\xC2-\xDF][\x80-\xBF]              # non-overlong 2-byte
  | \xE0[\xA0-\xBF][\x80-\xBF]          # excluding overlongs
  | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}   # straight 3-byte
  | \xED[\x80-\x9F][\x80-\xBF]          # excluding surrogates
  | \xF0[\x90-\xBF][\x80-\xBF]{2}       # planes 1-3
  | [\xF1-\xF3][\x80-\xBF]{3}           # planes 4-15
  | \xF4[\x80-\x8F][\x80-\xBF]{2}       # plane 16
)*\z/mnx
- @@mm =
uniToPyMap
Instance Method Summary collapse
Instance Method Details
#pinyin ⇒ Object
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/toPinyin.rb', line 45

# Maps every character of the receiver to its entry in the @@mm lookup table.
#
# Each character is keyed by its Unicode code point written as upper-case
# hex, zero-padded to four digits.
#
# @return [Array] one element per character: the character itself when its
#   key starts with "00" (code points below U+0100, which covers a-z and
#   A-Z); otherwise the @@mm entry with its last character removed, or nil
#   when @@mm has no entry for the key
def pinyin
  scan(/./mu).map do |char|
    # Convert the character to its Unicode code point in hex.
    code = format('%04X', char.unpack('U*').first)

    if code.start_with?('00')
      # Return the character as it is.
      char
    else
      entry = @@mm[code]
      # NOTE(review): presumably the chopped character is a tone digit or a
      # line terminator left over from the map data - confirm against uniToPyMap.
      entry.chop unless entry.nil?
    end
  end
end
#unicode ⇒ Object
30 31 32 33 34 35 36 37 38 |
# File 'lib/toPinyin.rb', line 30

# Lists the Unicode code point of every character in the receiver.
#
# @return [Array<String>] one upper-case hex string per character,
#   zero-padded to at least four digits (e.g. "0061", "4E2D", "1F600")
def unicode
  # The code point is decoded with unpack('U*'); the earlier Iconv-based
  # conversion is no longer used.
  scan(/./mu).map { |char| format('%04X', char.unpack('U*').first) }
end
#utf8 ⇒ Object
26 27 28 |
# File 'lib/toPinyin.rb', line 26

# Lists the encoded bytes of every character in the receiver as hex.
#
# @return [Array<String>] one lower-case hex string per character, two
#   digits per byte (e.g. "61" for "a", "e4b8ad" for U+4E2D)
def utf8
  # unpack('H*') always emits two digits per byte, so bytes below 0x10
  # (such as "\n" => "0a") keep their leading zero and the output stays
  # unambiguous.
  scan(/./mu).map { |char| char.unpack('H*').first }
end
#validate_utf8 ⇒ Object
cf. Paul Battley, po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
41 42 43 |
# File 'lib/toPinyin.rb', line 41

# Returns a copy of the receiver with byte sequences that are not valid
# UTF-8 removed. The receiver itself is not modified.
#
# @return [String] the cleaned string
def validate_utf8
  # Iconv is no longer part of the standard library on current Rubies, so
  # prefer String#scrub when it is available. The bytes are interpreted as
  # UTF-8 regardless of the receiver's encoding tag, as the Iconv call did.
  return dup.force_encoding('UTF-8').scrub('') if respond_to?(:scrub)

  # Fallback for interpreters without String#scrub: a padding space is
  # appended before conversion and stripped again afterwards.
  # NOTE(review): presumably the padding lets Iconv notice an invalid
  # sequence at the very end of the string - confirm against the cited article.
  Iconv.iconv('UTF-8//IGNORE', 'UTF-8', (self + ' ')).first[0..-2]
end