Class: Ting::HanyuPinyinParser
- Inherits:
- Object
- Inheritance chain: Object → Ting::HanyuPinyinParser
- Includes:
- Procable
- Defined in:
- lib/ting/hanyu_pinyin_parser.rb
Instance Method Summary
- #all_syllables ⇒ Object
- #consonant_syllables ⇒ Object
- #hanyu_reader ⇒ Object
- #hanyu_writer ⇒ Object
- #parse(pinyin) ⇒ Object (also: #call)
- #parse_cluster(pinyin) ⇒ Object
- #pinyin_regexp ⇒ Object
- #pinyin_separator_regexp ⇒ Object
Methods included from Procable
Instance Method Details
#all_syllables ⇒ Object
13 14 15 |
# File 'lib/ting/hanyu_pinyin_parser.rb', line 13

# Every syllable known to Ting, rendered through +hanyu_writer+ and ordered
# from longest to shortest. The list is computed once and cached.
#
# NOTE(review): the longest-first order presumably matters because the list is
# fed to Regexp.union, whose alternatives are tried in order — confirm.
#
# @return [Array] the rendered syllables, longest first
def all_syllables
  return @all_syllables if @all_syllables

  rendered = Ting.all_syllables.map(&hanyu_writer)
  @all_syllables = rendered.sort_by(&:length).reverse
end
#consonant_syllables ⇒ Object
17 18 19 |
# File 'lib/ting/hanyu_pinyin_parser.rb', line 17

# The subset of +all_syllables+ whose first character is one of the listed
# consonant letters (case-insensitive). The result is computed once and cached.
#
# @return [Array<String>] syllables starting with a consonant, in the same
#   order as +all_syllables+
def consonant_syllables
  # \A anchors at the start of the string; ^ would also match after a newline.
  @consonant_syllables ||= all_syllables.grep(/\A[bcdfghjklmnpqrstwxyz]/i)
end
#hanyu_reader ⇒ Object
9 10 11 |
# File 'lib/ting/hanyu_pinyin_parser.rb', line 9

# Returns the object built by Ting.reader(:hanyu, :accents), creating it on
# first use and reusing it afterwards.
def hanyu_reader
  return @hanyu_reader if @hanyu_reader

  @hanyu_reader = Ting.reader(:hanyu, :accents)
end
#hanyu_writer ⇒ Object
5 6 7 |
# File 'lib/ting/hanyu_pinyin_parser.rb', line 5

# Returns the object built by Ting.writer(:hanyu, :accents), creating it on
# first use and reusing it afterwards.
def hanyu_writer
  return @hanyu_writer if @hanyu_writer

  @hanyu_writer = Ting.writer(:hanyu, :accents)
end
#parse(pinyin) ⇒ Object Also known as: call
64 65 66 67 68 69 70 |
# File 'lib/ting/hanyu_pinyin_parser.rb', line 64

# Parses a string of pinyin: the input is lowercased, split into clusters on
# every run of characters matched by +pinyin_separator_regexp+, each cluster
# is split into syllables by +parse_cluster+, and every syllable is passed
# through +hanyu_reader+.
#
# @param pinyin [String] the text to parse
# @return [Array] the flattened results of +hanyu_reader+ for every syllable
# @raise [ArgumentError] if +parse_cluster+ cannot split a cluster
def parse(pinyin)
  # hanyu_reader cannot parse uppercase pinyin.
  clusters = pinyin.downcase.split(pinyin_separator_regexp)
  clusters.flat_map { |cluster| parse_cluster(cluster) }.flat_map(&hanyu_reader)
end
#parse_cluster(pinyin) ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/ting/hanyu_pinyin_parser.rb', line 41

# Splits one cluster of pinyin (no separators) into its syllables.
#
# @param pinyin [String] the cluster to split
# @return [Array<String>] the syllables in their original order; a trailing
#   erhua 'r' is reported as a final 'er' syllable
# @raise [ArgumentError] if part of the cluster cannot be matched
def parse_cluster(pinyin)
  remaining = pinyin
  syllables = []

  # Chop off one syllable at a time from the end by continuously matching the same regular expression.
  # This ensures the pinyin will be split into only valid pinyin syllables. Because a match capture will
  # only contain the *last* content it has matched, we have to use a loop.
  while (match = pinyin_regexp.match(remaining))
    # If an 'r' at the end was matched, this implies that all other parts of the string were matched as
    # syllables, and this cluster uses erhua.
    if match[3] == 'r'
      syllables << 'er'
      remaining = remaining.chop
    end

    # Group 2 holds the last of the following syllables; when there is none,
    # the only syllable left is the leading one in group 1.
    last_syllable = match[2] || match[1]
    syllables << last_syllable
    remaining = remaining[0, remaining.length - last_syllable.length]
  end

  unless remaining.empty?
    raise ArgumentError, "Unparseable pinyin fragment encountered: #{remaining}"
  end

  # Syllables were collected back to front.
  syllables.reverse
end
#pinyin_regexp ⇒ Object
21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/ting/hanyu_pinyin_parser.rb', line 21

# Regular expression matching one whole cluster of pinyin. It is built once
# and cached. Capture groups:
#   1 - the first syllable of the cluster
#   2 - the last of the syllables that follow it (nil when there are none)
#   3 - a trailing 'r' (nil when absent)
#
# @return [Regexp]
def pinyin_regexp
  # This will parse a cluster of pinyin, i.e. an uninterrupted string of pinyin characters without punctuation.
  @pinyin_cluster_regexp ||= /\A
    # Every syllable can appear at the start of a cluster.
    (#{Regexp.union(all_syllables)})

    # However, only syllables starting with a consonant can follow, as syllables starting with a vowel have to
    # be prefixed with an apostrophe.
    # Since it is common to omit the apostrophe when there is no ambiguity, also allow syllables starting with
    # a vowel after all letters except n and g, and after -ong, since -on does not appear at the end of a valid
    # syllable.
    (#{Regexp.union(consonant_syllables)}|(?<=[^ng]|[ōóǒòo]ng)#{Regexp.union(all_syllables)})*

    (r)?
  \Z/x
end
#pinyin_separator_regexp ⇒ Object
36 37 38 39 |
# File 'lib/ting/hanyu_pinyin_parser.rb', line 36

# Regular expression matching one or more consecutive characters that occur
# in none of the (lowercased) syllables of +all_syllables+. It is built once
# and cached.
#
# @return [Regexp]
def pinyin_separator_regexp
  # A regular expression that matches every character that can *not* appear in pinyin.
  @pinyin_separator_regexp ||= begin
    pinyin_chars = all_syllables.join.downcase.chars.uniq.sort
    # Escape each character so that one with a special meaning inside a
    # bracket expression (such as '-' or ']') cannot change the pattern.
    escaped = pinyin_chars.map { |char| Regexp.escape(char) }.join
    Regexp.new("[^#{escaped}]+")
  end
end