25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
|
# File 'lib/asciidammit.rb', line 25
# Rebuilds the transliteration database from the Unicode 3.2 character table.
#
# Downloads UnicodeData-3.2.0.txt, derives an ASCII stand-in (+:char+) for
# every code point that is not filtered out, then writes a
# character => replacement map, plus the list of code points that got no
# replacement under +:unmatched+, to DATABASEFILE as MessagePack. The number
# of unmatched code points is printed to standard output.
#
# Three passes are made:
# 1. parse each row into a record, indexed both by code point (Symbol) and
#    by character name (String), resolving the easy cases from the name;
# 2. resolve the remaining records per general category, using
#    decompositions, name-word permutations, numeric values and case mappings;
# 3. add one entry per code point keyed by the actual character.
#
# @return [Integer] whatever File#write returns for the serialized database
# @raise [RuntimeError] if a record has a general category not handled below
def update
  database = {}
  unmatched = []

  # Word lists are constant, so build them once instead of once per row.
  skipped_scripts = %w{KANNADA MALAYALAM TELUGU TAMIL ORIYA GUJARATI GURMUKHI BENGALI DEVANAGARI THAANA
                       SYRIAC HEBREW COPTIC HANGUL CYRILLIC ARMENIAN ARABIC SINHALA THAI CHEROKEE
                       OGHAM RUNIC TAGALOG HANUNOO BUHID TAGBANWA KHMER MONGOLIAN MEASURED KANGXI
                       LAO MYANMAR GEORGIAN ETHIOPIC TIBETAN}
  shape_words = %w{TURNED INVERTED CLOSED REVERSED OPEN}
  letter_noise = %w{LATIN MACRON SHARP WITH STROKE HOOK TOPBAR DOT ABOVE ACUTE}
  symbol_noise = %w{PARENTHESIZED CIRCLED ELEMENT VERTICAL HORIZONTAL STROKE LEFTWARDS ARROW BAR OVER
                    RIGHTWARDS WITH DOT ABOVE BELOW TAG INVERTED LEFT-POINTING RIGHT-POINTING
                    QUADRANT UPPER LEFT LOWER RIGHT
                    ANGLE SUPERSCRIPT SUBSCRIPT DOWNWARDS UPWARDS}

  url = 'http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.txt'
  # Kernel#open stopped accepting URLs in Ruby 3.0, so prefer URI.open when
  # open-uri provides it and fall back to the old spelling otherwise.
  # The block form makes sure the handle is closed.
  raw = if defined?(URI) && URI.respond_to?(:open)
    URI.open(url) { |io| io.read }
  else
    open(url) { |io| io.read }
  end

  # Pass 1: build the records.
  CSV.parse(raw, :col_sep => ';').each do |row|
    # A blank or truncated line has no code point / name to work with.
    next if row[0].nil? || row[1].nil?
    next if row[1] == '<control>'

    codepoint = {
      :codepoint => row[0],
      # Printable ASCII maps to itself.
      :char => row[0].length == 4 && row[0] >= '0020' && row[0] <= '007E' ? row[0].to_i(16).chr : nil,
      :name => row[1],
      :category => row[2],
      # Only 4-digit hex tokens are kept; tags such as "<compat>" are dropped.
      :decomp => row[5] ? row[5].split.select { |v| v =~ /^[0-9A-F]{4}$/i }.collect { |cp| cp.intern } : nil,
      :decimal => row[6] ? Integer(row[6]) : (row[7] ? Integer(row[7]) : nil),
      :numeric => row[8] =~ /^[0-9]+$/ ? Integer(row[8]) : nil,
      :upcase => row[12] ? row[12].intern : nil,
      :downcase => row[13] ? row[13].intern : nil,
    }

    # Rows excluded outright, by first word of the name or by name pattern.
    next if skipped_scripts.include?(codepoint[:name].split[0])
    next if codepoint[:name] =~ /CANADIAN SYLLABICS /
    next if codepoint[:name] =~ /PHILIPPINE (SINGLE|DOUBLE) PUNCTUATION/
    next if codepoint[:name] =~ /BOX DRAWINGS /
    next if codepoint[:name] =~ /MUSICAL SYMBOL /
    next if codepoint[:name] =~ /CJK COMPATIBILITY /

    if codepoint[:char].nil?
      # Strip shape modifiers so e.g. "TURNED A" is matched like "A".
      name = codepoint[:name].split.reject { |w| shape_words.include?(w) }.join(' ')
      # Each rule only applies while :char is still unset (||=), so order matters.
      codepoint[:char] ||= $2 if name =~ /LATIN CAPITAL (LIGATURE|LETTER) ([A-Z]+)/ && codepoint[:category] == 'Lu'
      codepoint[:char] ||= $1 if name =~ /LATIN LETTER ([A-Z]+)/ && codepoint[:category] == 'Lu'
      codepoint[:char] ||= $3.downcase if name =~ /LATIN( LETTER)? SMALL (LIGATURE|LETTER|CAPITAL|SCRIPT) ([A-Z]+)/ && codepoint[:category] == 'Ll'
      codepoint[:char] ||= $1.downcase if name =~ /LATIN LETTER STRETCHED ([A-Z]+)/ && codepoint[:category] == 'Ll'
      codepoint[:char] ||= ' ' if name =~ /NO-BREAK SPACE/ && %w{Cf Zs}.include?(codepoint[:category])
      codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z])$/ && codepoint[:category] == 'Lm'
      codepoint[:char] ||= $2.downcase if name =~ /MODIFIER LETTER SMALL( CAPITAL)? ([A-Z]) / && codepoint[:category] == 'Lm'
    end
    codepoint[:char] ||= MANUAL[codepoint[:name]]

    # The same record is reachable by code point (Symbol) and by name (String).
    database[codepoint[:codepoint].intern] = database[codepoint[:name]] = codepoint
  end

  # Joins the replacements of the records named by a decomposition;
  # yields nil when nothing usable comes out.
  from_decomp = lambda do |entry|
    joined = entry[:decomp].collect { |c| database[c] }.compact.collect { |c| c[:char] }.join('')
    joined == '' ? nil : joined
  end

  # Tries every ordering of every subset of +words+, longest first, as a
  # character name (with +prefix+ prepended) and returns the replacement of
  # the first record found that has one, or nil. Permutations are walked
  # lazily rather than materialised up front.
  find_by_words = lambda do |words, prefix|
    found = nil
    words.size.downto(1) do |n|
      words.permutation(n) do |perm|
        entry = database[prefix + perm.join(' ')]
        found = entry[:char] if entry && entry[:char]
        break if found
      end
      break if found
    end
    found
  end

  # Pass 2: resolve records that still have no replacement.
  database.each_pair do |k, v|
    # Visit each record once (via its Symbol key) and only if unresolved.
    next unless k.is_a?(Symbol) && !v[:char]

    case v[:category]
    when 'Mn'
      # A combining mark borrows the replacement of its spacing counterpart.
      base = database[v[:name].gsub(/^COMBINING /, '')]
      v[:char] = base[:char] if base
    when 'Lu', 'Ll'
      if v[:decomp]
        v[:char] = from_decomp.call(v)
      else
        names = v[:name].split.reject { |w| letter_noise.include?(w) }
        v[:char] = find_by_words.call(names, "LATIN ")
      end
    when 'Lm', 'Lo', 'So', 'Po', 'Sc', 'Mc', 'Sm', 'Ps', 'Pe', 'Pd', 'Cs'
      names = v[:name].split.reject { |w| symbol_noise.include?(w) }
      v[:char] = find_by_words.call(names, '')
    when 'Lt'
      m = v[:name].match(/(CAPITAL|SMALL) LETTER (.) WITH (CAPITAL|SMALL) LETTER (.)/)
      if m
        # Either half may be missing from the database; treat it as empty
        # rather than failing on nil.
        first = database["LATIN #{m[1]} LETTER #{m[2]}"]
        second = database["LATIN #{m[3]} LETTER #{m[4]}"]
        v[:char] = (first && first[:char]).to_s + (second && second[:char]).to_s
        v[:char] = nil if v[:char] == ''
      end
    when 'Nl', 'No', 'Nd'
      v[:char] = from_decomp.call(v) if v[:decomp]
      if v[:char].nil?
        # Fall back to the digit/decimal value, then the integer numeric value.
        offset = [v[:decimal], v[:numeric]].compact
        v[:char] = offset[0].to_s if offset.size > 0
      end
    when 'Pi', 'Pf'
      v[:char] = "'" if v[:name] =~ /SINGLE/
      v[:char] = '"' if v[:name] =~ /DOUBLE/
    when 'Zs'
      v[:char] = ' '
    when 'Sk', 'Cf', 'Me', 'Pc', 'Cc', 'Co', 'Zl', 'Zp'
      # NOTE(review): :ignore is set here but never read in this method, so
      # these records still end up in +unmatched+ below - confirm intent.
      v[:ignore] = true
    else
      raise "Unhandled character category #{v[:category]}"
    end

    if v[:char].nil?
      # Last resort: borrow from the case-mapped counterpart. The target may
      # have been filtered out in pass 1, hence the nil guards.
      upper = v[:upcase] && database[v[:upcase]]
      v[:char] = upper[:char].downcase if upper && upper[:char]
      lower = v[:downcase] && database[v[:downcase]]
      v[:char] = lower[:char].upcase if lower && lower[:char]
    end
    unmatched << v unless v[:char]
  end

  # Pass 3: key each replacement by the actual character. The Symbol key is
  # the hex code point; pack('U') encodes it as UTF-8 without resorting to eval.
  database.keys.each do |k|
    next unless k.is_a?(Symbol)
    database[[k.to_s.hex].pack('U')] = database[k][:char]
  end

  database[:unmatched] = unmatched
  puts "#{database[:unmatched].size} unmatched"
  # Only the character => replacement entries and :unmatched are persisted;
  # the intermediate record hashes are dropped.
  File.open(DATABASEFILE, 'wb') { |f| f.write(database.reject { |k, v| v.is_a?(Hash) }.to_msgpack) }
end
|