Module: UnicodeUtils::Impl

Defined in:
lib/unicode_utils/nfc.rb,
lib/unicode_utils/debug.rb,
lib/unicode_utils/each_word.rb,
lib/unicode_utils/read_cdata.rb,
lib/unicode_utils/conditional_casing.rb,
lib/unicode_utils/canonical_decomposition.rb,
lib/unicode_utils/compatibility_decomposition.rb,
lib/unicode_utils/hangul_syllable_decomposition.rb

Overview

:nodoc:

Defined Under Namespace

Modules: NFC

Classes: AfterIConditionalCasing, AfterSoftDottedConditionalCasing, BeforeDotConditionalCasing, ConditionalCasing, FinalSigmaConditionalCasing, MoreAboveConditionalCasing, NotBeforeDotConditionalCasing

Constant Summary collapse

# Code point set loaded through Impl.read_code_point_set from the
# "composition_exclusion_set" data file (a Hash of code point => true).
COMPOSITION_EXCLUSION_SET =
Impl.read_code_point_set("composition_exclusion_set")
# Reverse lookup derived from CANONICAL_DECOMPOSITION_MAP: for every entry
# whose decomposition has exactly two code points, maps
# first code point => { second code point => composed code point }.
CANONICAL_COMPOSITION_MAP =
Hash.new.tap do |m|
  CANONICAL_DECOMPOSITION_MAP.each_pair { |comp, decomp|
    if decomp.length == 2
      (m[decomp[0]] ||= {})[decomp[1]] = comp
    end
  }
end
# Maps a column title to a lambda that takes a code point (Integer) and
# returns the text to show in that column.
DEBUG_COLUMNS =
{
  # Quoted character; a few control characters are shown as escape
  # sequences, anything non-graphic or zero-width is shown as "N/A".
  "Char" => -> cp {
    case cp
    when 0x07 then '"\a"'
    when 0x08 then '"\b"'
    when 0x09 then '"\t"'
    when 0x0A then '"\n"'
    when 0x0D then '"\r"'
    else
      if UnicodeUtils.graphic_char?(cp) &&
            UnicodeUtils.char_display_width(cp) > 0
        '"' + cp.chr(Encoding::UTF_8) + '"'
      else
        "N/A"
      end
    end
  },
  # Upper-case hexadecimal code point, right-aligned to 7 characters.
  "Ordinal" => -> cp {
    cp.to_s(16).upcase.rjust(7)
  },
  # Whatever UnicodeUtils.sid returns for the code point.
  "Sid" => -> cp {
    UnicodeUtils.sid(cp)
  },
  # Result of UnicodeUtils.general_category, converted to a String.
  "General Category" => -> cp {
    UnicodeUtils.general_category(cp).to_s
  },
  # Space-separated two-digit hex bytes of the UTF-8 encoding.
  "UTF-8" => -> cp {
    begin
      cp.chr(Encoding::UTF_8).bytes.map { |b| sprintf("%02X", b) }.join(" ")
    rescue RangeError # surrogate code points are not valid in utf-8
      "N/A"
    end
  }
}
# Translates the single hex digit stored in the east asian width data files
# into the corresponding symbol (see read_east_asian_width_per_cp and
# read_east_asian_width_ranges).
EAST_ASIAN_WIDTH_SYMBOL_MAP =
{
  1 => :Ambiguous,
  2 => :Halfwidth,
  3 => :Wide,
  4 => :Fullwidth,
  5 => :Narrow
}.freeze
# Translates the single hex digit stored in the name alias data file into
# the alias type symbol (see read_name_aliases).
NAME_ALIAS_TYPE_TO_SYMBOL_MAP =
{
  1 => :correction,
  2 => :control,
  3 => :alternate,
  4 => :figment,
  5 => :abbreviation
}.freeze
# Language ids used as keys with a true value.
# NOTE(review): presumably the languages that have language-specific
# conditional casing rules -- confirm against the callers.
LANGS_WITH_RULES =
{:tr => true, :lt => true, :az => true}
# Conditional casing tables loaded through read_conditional_casings:
# code point => { language id (or nil) => conditional casing object }.
CONDITIONAL_UPCASE_MAP =
read_conditional_casings("cond_uc_map")
CONDITIONAL_DOWNCASE_MAP =
read_conditional_casings("cond_lc_map")
CONDITIONAL_TITLECASE_MAP =
read_conditional_casings("cond_tc_map")

Class Method Summary collapse

Class Method Details

.append_hangul_syllable_decomposition(str, s) ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/unicode_utils/hangul_syllable_decomposition.rb', line 20

# Appends code point +s+ to +str+, arithmetically splitting it into its
# leading consonant, vowel and (optional) trailing consonant when +s+ lies
# in the precomposed Hangul syllable range starting at U+AC00. Any other
# code point is appended unchanged.
def self.append_hangul_syllable_decomposition(str, s)
  syllable_base = 0xAC00
  leading_base = 0x1100
  vowel_base = 0x1161
  trailing_base = 0x11A7
  syllable_count = 11172
  vowel_count = 21
  trailing_count = 28
  per_leading = vowel_count * trailing_count

  offset = s - syllable_base
  # Not a precomposed syllable: pass the code point through untouched.
  return str << s unless offset >= 0 && offset < syllable_count

  leading, remainder = offset.divmod(per_leading)
  vowel, trailing = remainder.divmod(trailing_count)
  str << (leading_base + leading) << (vowel_base + vowel)
  # A trailing index of zero means the syllable has no trailing consonant.
  str << (trailing_base + trailing) unless trailing.zero?
end

.append_recursive_canonical_decomposition_mapping(str, mapping) ⇒ Object



48
49
50
51
52
53
54
55
56
57
# File 'lib/unicode_utils/canonical_decomposition.rb', line 48

# Appends every code point of +mapping+ to +str+, first expanding (to any
# depth) each code point that itself has an entry in
# CANONICAL_DECOMPOSITION_MAP.
def self.append_recursive_canonical_decomposition_mapping(str, mapping)
  mapping.each do |code_point|
    nested = CANONICAL_DECOMPOSITION_MAP[code_point]
    unless nested
      # Fully decomposed: emit as is.
      str << code_point
      next
    end
    append_recursive_canonical_decomposition_mapping(str, nested)
  end
end

.append_recursive_compatibility_decomposition_mapping(str, cp) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
# File 'lib/unicode_utils/compatibility_decomposition.rb', line 41

# Appends the full decomposition of +cp+ to +str+. The compatibility
# mapping is looked up first, the canonical mapping second; every code
# point of the mapping found is expanded recursively in the same way. A
# code point without any mapping is appended unchanged.
def self.append_recursive_compatibility_decomposition_mapping(str, cp)
  expansion = COMPATIBILITY_DECOMPOSITION_MAP[cp] ||
    CANONICAL_DECOMPOSITION_MAP[cp]
  return str << cp unless expansion

  expansion.each do |part|
    append_recursive_compatibility_decomposition_mapping(str, part)
  end
end

.column_widths(table) ⇒ Object



89
90
91
92
93
94
95
96
97
98
99
# File 'lib/unicode_utils/debug.rb', line 89

# Returns an Array holding, for each column index, the largest display
# width (per UnicodeUtils.display_width) of any cell in that column of
# +table+. Rows may have different lengths.
def self.column_widths(table)
  widths = []
  table.each do |cells|
    cells.each_with_index do |text, index|
      needed = UnicodeUtils.display_width(text)
      known = widths[index]
      # Keep the widest value seen so far for this column.
      widths[index] = needed unless known && known >= needed
    end
  end
  widths
end

.composition(str) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/unicode_utils/nfc.rb', line 46

# Returns a new string (same encoding as +str+) in which combining
# sequences of +str+ have been composed.
#
# The string is scanned code point by code point while remembering the most
# recent starter (combining class 0) and the non-starters after it that
# could not be composed. Each new code point is combined with the
# remembered starter when possible:
#
# * Hangul: a leading consonant followed by a vowel forms an LV syllable,
#   and an LV syllable followed by a trailing consonant forms an LVT
#   syllable (computed arithmetically).
# * Everything else: looked up in Impl::CANONICAL_COMPOSITION_MAP and
#   accepted only if Impl::NFC.primary_composite? approves the result.
#
# NOTE(review): presumably +str+ is expected to be canonically decomposed
# and ordered already -- confirm against the caller.
def self.composition(str)
  ### constants for hangul composition ###
  sbase = 0xAC00
  lbase = 0x1100
  vbase = 0x1161
  tbase = 0x11A7
  lcount = 19
  vcount = 21
  tcount = 28
  ncount = vcount * tcount
  scount = lcount * ncount
  ########################################

  String.new.force_encoding(str.encoding).tap do |res|
    last_starter = nil
    uncomposable_non_starters = []
    str.each_codepoint { |cp|
      if COMBINING_CLASS_MAP[cp] == 0 # starter?
        combined = false
        # A starter can only combine with the previous starter when no
        # uncomposed non-starter stands between them.
        if last_starter && uncomposable_non_starters.empty?
          ### hangul ###
          lindex = last_starter - lbase
          if 0 <= lindex && lindex < lcount
            vindex = cp - vbase
            # Valid vowel indexes are 0...vcount; the upper bound is
            # exclusive, otherwise U+1176 would be composed into a
            # syllable belonging to the next leading consonant.
            if 0 <= vindex && vindex < vcount
              last_starter =
                sbase + (lindex * vcount + vindex) * tcount
              combined = true
            end
          end
          unless combined
            sindex = last_starter - sbase
            # Only LV syllables (no trailing consonant yet) accept one.
            if 0 <= sindex && sindex < scount && (sindex % tcount) == 0
              tindex = cp - tbase
              # tbase itself is one below the first trailing consonant, so
              # index 0 is not a trailing consonant and must not be
              # absorbed (it would vanish from the output).
              if 0 < tindex && tindex < tcount
                last_starter += tindex
                combined = true
              end
            end
          end
          ##############
          unless combined
            map = Impl::CANONICAL_COMPOSITION_MAP[last_starter]
            composition = map && map[cp]
            if composition && Impl::NFC.primary_composite?(composition)
              last_starter = composition
              combined = true
            end
          end
        end
        unless combined
          # Flush the finished sequence and start a new one at cp.
          res << last_starter if last_starter
          uncomposable_non_starters.each { |nc| res << nc }
          uncomposable_non_starters.clear
          last_starter = cp
        end
      else
        last_non_starter = uncomposable_non_starters.last
        if last_non_starter && Impl::NFC.blocked?(last_non_starter, cp)
          uncomposable_non_starters << cp
        else
          map = Impl::CANONICAL_COMPOSITION_MAP[last_starter]
          composition = map && map[cp]
          if composition && Impl::NFC.primary_composite?(composition)
            last_starter = composition
          else
            uncomposable_non_starters << cp
          end
        end
      end
    }
    # Flush whatever is still pending at the end of the input.
    res << last_starter if last_starter
    uncomposable_non_starters.each { |nc| res << nc }
  end
end

.conditional_downcase_mapping(cp, str, pos, language_id) ⇒ Object



140
141
142
143
144
145
146
147
148
# File 'lib/unicode_utils/conditional_casing.rb', line 140

# Returns the conditional lowercase mapping for +cp+ found in
# CONDITIONAL_DOWNCASE_MAP, or nil. The entry for +language_id+ is
# preferred over the language independent entry (key nil), and the chosen
# entry only applies if its context matches at +pos+ in +str+.
def self.conditional_downcase_mapping(cp, str, pos, language_id)
  by_language = CONDITIONAL_DOWNCASE_MAP[cp]
  return unless by_language

  rule = by_language[language_id] || by_language[nil]
  rule.mapping if rule && rule.context_match?(str, pos)
end

.conditional_titlecase_mapping(cp, str, pos, language_id) ⇒ Object



150
151
152
153
154
155
156
157
158
# File 'lib/unicode_utils/conditional_casing.rb', line 150

# Returns the conditional titlecase mapping for +cp+ found in
# CONDITIONAL_TITLECASE_MAP, or nil. The entry for +language_id+ is
# preferred over the language independent entry (key nil), and the chosen
# entry only applies if its context matches at +pos+ in +str+.
def self.conditional_titlecase_mapping(cp, str, pos, language_id)
  by_language = CONDITIONAL_TITLECASE_MAP[cp]
  return unless by_language

  rule = by_language[language_id] || by_language[nil]
  rule.mapping if rule && rule.context_match?(str, pos)
end

.conditional_upcase_mapping(cp, str, pos, language_id) ⇒ Object



130
131
132
133
134
135
136
137
138
# File 'lib/unicode_utils/conditional_casing.rb', line 130

# Returns the conditional uppercase mapping for +cp+ found in
# CONDITIONAL_UPCASE_MAP, or nil. The entry for +language_id+ is preferred
# over the language independent entry (key nil), and the chosen entry only
# applies if its context matches at +pos+ in +str+.
def self.conditional_upcase_mapping(cp, str, pos, language_id)
  by_language = CONDITIONAL_UPCASE_MAP[cp]
  return unless by_language

  rule = by_language[language_id] || by_language[nil]
  rule.mapping if rule && rule.context_match?(str, pos)
end

.open_cdata_file(filename, &block) ⇒ Object



27
28
29
# File 'lib/unicode_utils/read_cdata.rb', line 27

# Opens the data file +filename+ located in CDATA_DIR for reading as
# US-ASCII and hands the open file to +block+ (see File.open for the
# return value and closing behaviour).
def self.open_cdata_file(filename, &block)
  path = File.join(CDATA_DIR, filename)
  File.open(path, "r:US-ASCII:-", &block)
end


101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/unicode_utils/debug.rb', line 101

# Prints one table row to +io+. Every cell is preceded by a space; every
# cell except the last is padded to its column width (plus one space) and
# followed by "|". The row is terminated with a newline.
def self.print_row(row, column_widths, io)
  final_index = row.length - 1
  row.each_with_index do |cell, index|
    io.print(" ")
    io.print(cell)
    # The last cell gets neither padding nor a separator.
    next if index == final_index

    padding = column_widths[index] - UnicodeUtils.display_width(cell) + 1
    io.print(" " * padding)
    io.print("|")
  end
  io.puts
end


115
116
117
118
119
120
121
122
123
# File 'lib/unicode_utils/debug.rb', line 115

# Prints a horizontal rule to +io+: for each column a run of dashes two
# characters wider than the column, with "+" between adjacent columns,
# terminated with a newline.
def self.print_separator_row(column_widths, io)
  final_index = column_widths.length - 1
  column_widths.each_with_index do |width, index|
    io.print("-" * (width + 2))
    io.print("+") unless index == final_index
  end
  io.puts
end


125
126
127
128
129
130
131
132
133
# File 'lib/unicode_utils/debug.rb', line 125

# Prints +table+ (an Array of rows) to +io+: the first row, then a
# separator row, then all remaining rows, each aligned to the column
# widths computed over the whole table. Flushes +io+ when done.
def self.print_table(table, io)
  widths = column_widths(table)
  header = table[0]
  print_row(header, widths, io)
  print_separator_row(widths, io)
  table.drop(1).each do |body_row|
    print_row(body_row, widths, io)
  end
  io.flush
end

.put_into_canonical_order(str) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/unicode_utils/canonical_decomposition.rb', line 59

# Returns +str+ with its combining marks sorted by combining class (per
# COMBINING_CLASS_MAP). A pair of adjacent code points is out of order when
# the second one is a non-starter (class != 0) with a smaller class than
# the first. If no such pair exists, +str+ itself is returned. Otherwise a
# copy with out-of-order pairs swapped is built and processed again
# recursively until nothing is left to swap.
def self.put_into_canonical_order(str)
  classes = str.each_codepoint.map { |code| COMBINING_CLASS_MAP[code] }
  out_of_order = classes.each_cons(2).any? { |before, after|
    after != 0 && before > after
  }
  return str unless out_of_order

  swapped = String.new.force_encoding(str.encoding)
  held = nil       # code point read but not yet written
  held_class = nil
  str.each_codepoint do |code|
    klass = COMBINING_CLASS_MAP[code]
    if held.nil?
      held = code
      held_class = klass
    elsif klass != 0 && held_class > klass
      # Swap the pair and start over with nothing held.
      swapped << code << held
      held = nil
      held_class = nil
    else
      swapped << held
      held = code
      held_class = klass
    end
  end
  swapped << held if held
  put_into_canonical_order(swapped)
end

.read_code_point_map(filename) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
# File 'lib/unicode_utils/read_cdata.rb', line 43

# Reads the data file +filename+ as a sequence of records made of two
# 6-hexdigit fields and returns a Hash mapping the first field to the
# second (both converted to Integer).
def self.read_code_point_map(filename)
  result = {}
  open_cdata_file(filename) do |io|
    # One buffer is reused for every read to avoid allocating per field.
    field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    while io.read(6, field)
      source = field.to_i(16)
      result[source] = io.read(6, field).to_i(16)
    end
  end
  result
end

.read_code_point_set(filename) ⇒ Object



31
32
33
34
35
36
37
38
39
40
41
# File 'lib/unicode_utils/read_cdata.rb', line 31

# Reads the data file +filename+ as a sequence of 6-hexdigit code points
# and returns them as a set, represented by a Hash mapping each code point
# (Integer) to true.
def self.read_code_point_set(filename)
  members = {}
  open_cdata_file(filename) do |io|
    field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    members[field.to_i(16)] = true while io.read(6, field)
  end
  members
end

.read_combining_class_map ⇒ Object



101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/unicode_utils/read_cdata.rb', line 101

# Reads the "combining_class_map" data file, whose records consist of a
# 6-hexdigit code point followed by a 2-hexdigit value, and returns a Hash
# mapping code point to value (both as Integer).
def self.read_combining_class_map
  result = {}
  open_cdata_file("combining_class_map") do |io|
    cp_field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    class_field = ("x" * 2).force_encoding(Encoding::US_ASCII)
    while io.read(6, cp_field)
      code_point = cp_field.to_i(16)
      result[code_point] = io.read(2, class_field).to_i(16)
    end
  end
  result
end

.read_conditional_casings(filename) ⇒ Object



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/unicode_utils/read_cdata.rb', line 84

# Reads conditional casing records from the data file +filename+, one per
# line, with fields separated by ";":
#
#   code point (hex); mapping (comma separated hex); language id; context
#
# An empty language id is stored as nil. Underscores are removed from the
# context name, which is then used to select the class
# Impl::<Context>ConditionalCasing (Impl::ConditionalCasing if there is no
# context field). Returns a Hash of
# code point => { language id => casing instance }.
def self.read_conditional_casings(filename)
  casings = {}
  open_cdata_file(filename) do |io|
    io.each_line do |raw|
      raw.chomp!
      cp_hex, mapping_hex, lang, ctx = raw.split(";")
      code_point = cp_hex.to_i(16)
      targets = mapping_hex.split(",").map { |hex| hex.to_i(16) }
      language = lang.empty? ? nil : lang.to_sym
      class_prefix = ctx && ctx.gsub('_', '')
      casing_class = Impl.const_get("#{class_prefix}ConditionalCasing")
      per_language = (casings[code_point] ||= {})
      per_language[language] = casing_class.new(targets)
    end
  end
  casings
end

.read_east_asian_width_per_cp(filename) ⇒ Object



151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/unicode_utils/read_cdata.rb', line 151

# Reads the data file +filename+, whose records consist of a 6-hexdigit
# code point followed by a single hexdigit, and returns a Hash mapping the
# code point to the symbol EAST_ASIAN_WIDTH_SYMBOL_MAP assigns to that
# digit. Code points missing from the Hash yield :Neutral.
def self.read_east_asian_width_per_cp(filename)
  # same record layout as read_hexdigit_map, values translated to symbols
  widths = Hash.new(:Neutral)
  open_cdata_file(filename) do |io|
    cp_field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    digit_field = ("x" * 1).force_encoding(Encoding::US_ASCII)
    while io.read(6, cp_field)
      code_point = cp_field.to_i(16)
      digit = io.read(1, digit_field).to_i(16)
      widths[code_point] = EAST_ASIAN_WIDTH_SYMBOL_MAP[digit]
    end
  end
  widths
end

.read_east_asian_width_ranges(filename) ⇒ Object



167
168
169
170
171
172
173
# File 'lib/unicode_utils/read_cdata.rb', line 167

# Returns the list produced by read_range_to_hexdigit_list for +filename+
# after replacing, in place, the integer value of every pair with the
# symbol EAST_ASIAN_WIDTH_SYMBOL_MAP assigns to it.
def self.read_east_asian_width_ranges(filename)
  ranges = read_range_to_hexdigit_list(filename)
  ranges.each do |entry|
    entry[1] = EAST_ASIAN_WIDTH_SYMBOL_MAP[entry[1]]
  end
  ranges
end

.read_general_category_per_cp(filename) ⇒ Object



175
176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/unicode_utils/read_cdata.rb', line 175

# Reads the data file +filename+, whose records consist of a 6-hexdigit
# code point followed by a two character category name, and returns a Hash
# mapping code point (Integer) to category (Symbol).
def self.read_general_category_per_cp(filename)
  categories = {}
  open_cdata_file(filename) do |io|
    cp_field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    category_field = ("x" * 2).force_encoding(Encoding::US_ASCII)
    while io.read(6, cp_field)
      code_point = cp_field.to_i(16)
      categories[code_point] = io.read(2, category_field).to_sym
    end
  end
  categories
end

.read_general_category_ranges(filename) ⇒ Object



189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/unicode_utils/read_cdata.rb', line 189

# Reads the data file +filename+, whose records consist of two 6-hexdigit
# code points (first and last of a range) followed by a two character
# category name. Returns an Array of [Range, Symbol] pairs in file order.
def self.read_general_category_ranges(filename)
  ranges = []
  open_cdata_file(filename) do |io|
    cp_field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    category_field = ("x" * 2).force_encoding(Encoding::US_ASCII)
    while io.read(6, cp_field)
      first = cp_field.to_i(16)
      last = io.read(6, cp_field).to_i(16)
      category = io.read(2, category_field).to_sym
      ranges << [first..last, category]
    end
  end
  ranges
end

.read_hexdigit_map(filename) ⇒ Object

Read a map whose keys are code points (6 hexdigits, converted to integer) and whose values are single hexdigits (converted to integer).



118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/unicode_utils/read_cdata.rb', line 118

# Reads the data file +filename+, whose records consist of a 6-hexdigit
# code point followed by a single hexdigit, and returns a Hash mapping
# code point to digit value (both as Integer).
def self.read_hexdigit_map(filename)
  result = {}
  open_cdata_file(filename) do |io|
    cp_field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    digit_field = ("x" * 1).force_encoding(Encoding::US_ASCII)
    while io.read(6, cp_field)
      code_point = cp_field.to_i(16)
      result[code_point] = io.read(1, digit_field).to_i(16)
    end
  end
  result
end

.read_multivalued_map(filename) ⇒ Object



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/unicode_utils/read_cdata.rb', line 55

# Reads the data file +filename+, in which each record is a 6-hexdigit key
# code point followed by any number of 6-hexdigit value code points and a
# 6-byte terminator field starting with "x". Returns a Hash mapping the key
# to the Array of its values (all as Integer).
def self.read_multivalued_map(filename)
  result = {}
  open_cdata_file(filename) do |io|
    field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    while io.read(6, field)
      key = field.to_i(16)
      values = []
      # 120 is the byte value of "x", which marks the end of the record.
      values << field.to_i(16) until io.read(6, field).getbyte(0) == 120
      result[key] = values
    end
  end
  result
end

.read_name_aliases(filename) ⇒ Object



219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/unicode_utils/read_cdata.rb', line 219

# Reads name aliases from the data file +filename+. Each record is:
#
#   6 hexdigits  code point
#   1 hexdigit   number of aliases that follow
#   per alias:   1 hexdigit alias type (see NAME_ALIAS_TYPE_TO_SYMBOL_MAP),
#                2 hexdigits name length, then the name itself
#
# Returns a Hash mapping code point (Integer) to a frozen Array of
# NameAlias objects built from the frozen name and the type symbol.
def self.read_name_aliases(filename)
  ascii_field = lambda { |size| ("x" * size).force_encoding(Encoding::US_ASCII) }
  result = {}
  open_cdata_file(filename) do |io|
    cp_field = ascii_field.call(6)
    count_field = ascii_field.call(1)
    type_field = ascii_field.call(1)
    length_field = ascii_field.call(2)
    while io.read(6, cp_field)
      alias_count = io.read(1, count_field).to_i(16)
      entries = Array.new(alias_count) do
        kind = NAME_ALIAS_TYPE_TO_SYMBOL_MAP[io.read(1, type_field).to_i(16)]
        name_length = io.read(2, length_field).to_i(16)
        NameAlias.new(io.read(name_length).freeze, kind)
      end
      result[cp_field.to_i(16)] = entries.freeze
    end
  end
  result
end

.read_names(filename) ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
# File 'lib/unicode_utils/read_cdata.rb', line 72

# Reads the data file +filename+, in which every line starts with a
# 6-hexdigit code point immediately followed by a name. Returns a Hash
# mapping code point (Integer) to name (String without the line ending).
def self.read_names(filename)
  names = {}
  open_cdata_file(filename) do |io|
    cp_field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    while io.read(6, cp_field)
      code_point = cp_field.to_i(16)
      # The rest of the current line is the name.
      name = io.gets
      name.chomp!
      names[code_point] = name
    end
  end
  names
end

.read_range_to_hexdigit_list(filename) ⇒ Object

Returns a list (array) of pairs (two element Arrays) of Range (code points) and associated integer value.



134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/unicode_utils/read_cdata.rb', line 134

# Reads the data file +filename+, whose records consist of two 6-hexdigit
# code points (first and last of a range) followed by a single hexdigit.
# Returns an Array of [Range, Integer] pairs in file order.
def self.read_range_to_hexdigit_list(filename)
  pairs = []
  open_cdata_file(filename) do |io|
    cp_field = ("x" * 6).force_encoding(Encoding::US_ASCII)
    digit_field = ("x" * 1).force_encoding(Encoding::US_ASCII)
    while io.read(6, cp_field)
      first = cp_field.to_i(16)
      last = io.read(6, cp_field).to_i(16)
      digit = io.read(1, digit_field).to_i(16)
      pairs << [first..last, digit]
    end
  end
  pairs
end

.read_symbol_map(filename) ⇒ Object



206
207
208
209
210
211
212
213
214
215
216
217
# File 'lib/unicode_utils/read_cdata.rb', line 206

# Reads the data file +filename+, in which every line holds two fields
# separated by ";". Surrounding whitespace is stripped from both fields and
# the result is returned as a Hash mapping the first field to the second,
# both converted to Symbol.
def self.read_symbol_map(filename)
  symbols = {}
  open_cdata_file(filename) do |io|
    io.each_line do |line|
      key, value = line.split(";")
      symbols[key.strip.to_sym] = value.strip.to_sym
    end
  end
  symbols
end

.word_break?(cs, i) ⇒ Boolean

Returns:

  • (Boolean)


41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/unicode_utils/each_word.rb', line 41

# Decides whether there is a word boundary between position +i+ and
# position +i + 1+ of +cs+. +cs+ is indexed with integers and yields small
# integer codes that the rules below compare against; the rules are tried
# in order and the first one that matches determines the result.
#
# Returns true if there is a break after position +i+, false otherwise.
#
# NOTE(review): the codes presumably stand for word break property values
# (judging by the rule labels: 0x0 CR, 0x1 LF, 0x2 Newline, 0x3 Extend,
# 0x4 Format, 0x5 Katakana, 0x6 ALetter, 0x7 MidLetter, 0x8 MidNum,
# 0x9 MidNumLet, 0xA Numeric, 0xB ExtendNumLet, 0xC Regional_Indicator) --
# verify against the code that builds +cs+.
# NOTE(review): the leftward scans below can reach negative indexes and the
# rightward scan can run past the end; this assumes +cs+ yields a value
# other than 0x3/0x4 there (e.g. nil) -- confirm against the caller.
def self.word_break?(cs, i)
  # wb3
  cs_i = cs[i]
  i1 = i + 1
  cs_i1 = cs[i1]
  if cs_i == 0x0 && cs_i1 == 0x1
    return false
  end
  # wb3a
  if cs_i == 0x2 || cs_i == 0x0 || cs_i == 0x1
    return true
  end
  # wb3b
  if cs_i1 == 0x2 || cs_i1 == 0x0 || cs_i1 == 0x1
    return true
  end
  # wb5
  i0 = i
  # inline skip_l
  # (walk left from i over codes 0x3/0x4; ci0 becomes the first code that
  # is neither, i0 its index)
  c = nil
  loop { c = cs[i0]; break unless c == 0x3 || c == 0x4; i0 -= 1 }
  ci0 = c
  if ci0 == 0x6 && cs_i1 == 0x6
    return false
  end
  # wb6
  i2 = i1 + 1
  # inline skip_r
  # (walk right from i1 + 1 over codes 0x3/0x4; i2 ends on the first index
  # whose code is neither)
  loop { c = cs[i2]; break unless c == 0x3 || c == 0x4; i2 += 1 }
  if ci0 == 0x6 && (cs_i1 == 0x7 || cs_i1 == 0x9) && cs[i2] == 0x6
    return false
  end
  # wb7
  i_1 = i0 - 1
  # inline skip_l
  # (same leftward walk, starting one position before i0)
  loop { c = cs[i_1]; break unless c == 0x3 || c == 0x4; i_1 -= 1 }
  if cs[i_1] == 0x6 && (ci0 == 0x7 || ci0 == 0x9) && cs_i1 == 0x6
    return false
  end
  # wb8
  if ci0 == 0xA && cs_i1 == 0xA
    return false
  end
  # wb9
  if ci0 == 0x6 && cs_i1 == 0xA
    return false
  end
  # wb10
  if ci0 == 0xA && cs_i1 == 0x6
    return false
  end
  # wb11
  if cs[i_1] == 0xA && (ci0 == 0x8 || ci0 == 0x9) && cs_i1 == 0xA
    return false
  end
  # wb12
  if ci0 == 0xA && (cs_i1 == 0x8 || cs_i1 == 0x9) && cs[i2] == 0xA
    return false
  end
  # wb13
  if ci0 == 0x5 && cs_i1 == 0x5
    return false
  end
  # wb13a
  if (ci0 == 0x6 || ci0 == 0xA || ci0 == 0x5 || ci0 == 0xB) && cs_i1 == 0xB
    return false
  end
  # wb13b
  if ci0 == 0xB && (cs_i1 == 0x6 || cs_i1 == 0xA || cs_i1 == 0x5)
    return false
  end
  # wb13c
  if ci0 == 0xC && cs_i1 == 0xC
    return false
  end
  # break unless next char is Extend/Format
  cs_i1 != 0x3 && cs_i1 != 0x4
end