Method: UnicodeUtils.each_grapheme
- Defined in:
- lib/unicode_utils/each_grapheme.rb
permalink .each_grapheme(str) {|grapheme| ... } ⇒ Object
Iterate over the grapheme clusters that make up `str`. A grapheme cluster is a user-perceived character (the basic unit of a writing system for a language) and consists of one or more code points.
This method uses the default Unicode algorithm for extended grapheme clusters.
Returns an enumerator if no block is given.
Examples:
require "unicode_utils/each_grapheme"
UnicodeUtils.each_grapheme("a\r\nb") { |g| p g }
prints:
"a"
"\r\n"
"b"
and
UnicodeUtils.each_grapheme("a\r\nb").count # => 3
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/unicode_utils/each_grapheme.rb', line 35
#
# Iterate over the grapheme clusters that make up +str+.
#
# The string is walked one code point at a time. For each code point the
# Grapheme_Cluster_Break property code is looked up in
# GRAPHEME_CLUSTER_BREAK_MAP and compared with the code of the preceding
# code point to decide whether a cluster boundary lies between the two.
# Every yielded cluster is a freshly built String tagged with the encoding
# of +str+.
#
# Property codes as they are used by the rules below (read off the rule
# comments; the map itself is defined outside this method):
#   0x0 CR, 0x1 LF, 0x2 other control, 0x3 Extend, 0x4 Prepend,
#   0x5 SpacingMark, 0xB Regional_Indicator.
#   0x6..0xA are the Hangul syllable classes -- presumably L, V, T, LV, LVT
#   in that order; confirm against the map's generator.
#
# str:: the String to split into grapheme clusters.
#
# Yields each grapheme cluster as a String.
#
# Returns an Enumerator over the clusters if no block is given; otherwise
# the value of the last block call, or nil for an empty string.
def each_grapheme(str)
  return enum_for(__method__, str) unless block_given?

  c0_prop = nil # property code of the previous code point; nil before the first
  grapheme = String.new.force_encoding(str.encoding)
  str.each_codepoint do |c|
    gbreak = false
    c_prop = GRAPHEME_CLUSTER_BREAK_MAP[c]
    ### rules ###
    # The order of the branches matters: the first matching rule wins.
    if c0_prop == 0x0 && c_prop == 0x1
      # don't break CR LF
    elsif c0_prop == 0x0 || c0_prop == 0x1 || c0_prop == 0x2
      # break after controls
      gbreak = true
    elsif c_prop == 0x0 || c_prop == 0x1 || c_prop == 0x2
      # break before controls
      gbreak = true
    elsif c0_prop == 0x6 &&
          (c_prop == 0x6 || c_prop == 0x7 || c_prop == 0x9 || c_prop == 0xA)
      # don't break hangul syllable
    elsif (c0_prop == 0x9 || c0_prop == 0x7) &&
          (c_prop == 0x7 || c_prop == 0x8)
      # don't break hangul syllable
    elsif (c0_prop == 0xA || c0_prop == 0x8) && c_prop == 0x8
      # don't break hangul syllable
    elsif c0_prop == 0xB && c_prop == 0xB
      # don't break between regional indicator symbols
    elsif c_prop == 0x3
      # don't break before extending characters
    elsif c_prop == 0x5
      # don't break before SpacingMarks
    elsif c0_prop == 0x4
      # don't break after Prepend characters
    else
      # break everywhere
      gbreak = true
    end
    #############
    # The very first code point always takes the "break" path, but there is
    # nothing buffered yet, so the empty check suppresses a bogus yield.
    if gbreak && !grapheme.empty?
      yield grapheme
      grapheme = String.new.force_encoding(str.encoding)
    end
    grapheme << c # String#<< with an Integer appends that code point
    c0_prop = c_prop
  end
  # Flush the final cluster, which no later code point can terminate.
  yield grapheme unless grapheme.empty?
end