Class: Ting::HanyuPinyinParser

Inherits:
Object
  • Object
show all
Includes:
Procable
Defined in:
lib/ting/hanyu_pinyin_parser.rb

Instance Method Summary collapse

Methods included from Procable

#memoize, #to_proc

Instance Method Details

#all_syllables ⇒ Object



13
14
15
# File 'lib/ting/hanyu_pinyin_parser.rb', line 13

# Every syllable known to Ting, rendered through hanyu_writer and ordered from the
# longest string to the shortest. The list is computed on first use and cached in
# @all_syllables.
#
# NOTE(review): the longest-first order presumably exists so that alternations built
# from this list (see pinyin_regexp) try longer syllables before their prefixes.
#
# @return [Array<String>] the written syllables, longest first
def all_syllables
  return @all_syllables if @all_syllables

  written = Ting.all_syllables.map(&hanyu_writer)
  @all_syllables = written.sort_by(&:length).reverse
end

#consonant_syllables ⇒ Object



17
18
19
# File 'lib/ting/hanyu_pinyin_parser.rb', line 17

# The subset of all_syllables whose first letter is one of
# b c d f g h j k l m n p q r s t w x y z (case-insensitive), kept in the same order
# as all_syllables. The list is computed on first use and cached in @consonant_syllables.
#
# @return [Array<String>] syllables that start with a consonant letter
def consonant_syllables
  return @consonant_syllables if @consonant_syllables

  consonant_initial = /^[bcdfghjklmnpqrstwxyz]/i
  @consonant_syllables = all_syllables.select { |syllable| consonant_initial.match?(syllable) }
end

#hanyu_reader ⇒ Object



9
10
11
# File 'lib/ting/hanyu_pinyin_parser.rb', line 9

# The object returned by Ting.reader(:hanyu, :accents). It is requested once and
# cached in @hanyu_reader, so every call hands back the same instance.
#
# @return [Object] the cached reader
def hanyu_reader
  return @hanyu_reader if @hanyu_reader

  @hanyu_reader = Ting.reader(:hanyu, :accents)
end

#hanyu_writer ⇒ Object



5
6
7
# File 'lib/ting/hanyu_pinyin_parser.rb', line 5

# The object returned by Ting.writer(:hanyu, :accents). It is requested once and
# cached in @hanyu_writer, so every call hands back the same instance.
#
# @return [Object] the cached writer
def hanyu_writer
  return @hanyu_writer if @hanyu_writer

  @hanyu_writer = Ting.writer(:hanyu, :accents)
end

#parse(pinyin) ⇒ Object Also known as: call



64
65
66
67
68
69
70
# File 'lib/ting/hanyu_pinyin_parser.rb', line 64

# Parses a string of pinyin text.
#
# The input is lowercased, cut into clusters wherever pinyin_separator_regexp matches,
# each cluster is broken into syllable strings by parse_cluster, and every syllable
# string is finally passed through hanyu_reader. The results are flattened into a
# single array.
#
# @param pinyin [String] the text to parse
# @return [Array] the flattened results of applying hanyu_reader to each syllable
# @raise [ArgumentError] propagated from parse_cluster when a cluster cannot be split
def parse(pinyin)
  # hanyu_reader cannot parse uppercase pinyin, so normalise the case up front.
  lowered = pinyin.downcase

  syllables = lowered.split(pinyin_separator_regexp).flat_map { |cluster| parse_cluster(cluster) }
  syllables.flat_map(&hanyu_reader)
end

#parse_cluster(pinyin) ⇒ Object

Raises:

  • (ArgumentError)


41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/ting/hanyu_pinyin_parser.rb', line 41

# Splits one cluster (a run of pinyin characters with no separators) into its
# syllable strings, in reading order.
#
# @param pinyin [String] the cluster to split
# @return [Array<String>] the syllables; a trailing erhua 'r' is reported as a final 'er'
# @raise [ArgumentError] if the cluster cannot be matched by pinyin_regexp
def parse_cluster(pinyin)
  remaining = pinyin
  collected = []

  # A repeated capture group only remembers its *last* match, so the same expression is
  # applied over and over, peeling one syllable off the end of the string each time.
  # Matching the whole remainder on every pass guarantees that only valid syllables are
  # ever split off.
  until (found = pinyin_regexp.match(remaining)).nil?
    # A captured trailing 'r' means everything before it was matched as syllables, so
    # this cluster uses erhua.
    if found[3] == 'r'
      collected.push('er')
      remaining = remaining.chop
    end

    # Group 2 is absent when the cluster is down to a single syllable; use group 1 then.
    tail = found[2] || found[1]
    collected.push(tail)
    remaining = remaining[0, remaining.length - tail.length]
  end

  unless remaining.empty?
    raise ArgumentError, "Unparseable pinyin fragment encountered: #{remaining}"
  end

  # Syllables were gathered from the end of the cluster backwards.
  collected.reverse
end

#pinyin_regexp ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/ting/hanyu_pinyin_parser.rb', line 21

# Builds (once) the regular expression that matches a whole pinyin cluster, i.e. an
# uninterrupted string of pinyin characters without punctuation. The result is cached
# in @pinyin_cluster_regexp.
#
# Capture groups, as consumed by parse_cluster:
#   1 - the first syllable of the cluster
#   2 - the last of the following syllables (a repeated group keeps only its final match)
#   3 - a trailing 'r', if present
#
# The expression is anchored at both ends, so it only matches when the entire string
# consists of syllables. The end anchor is \z rather than \Z: \Z would also match just
# before a trailing newline, and parse_cluster trims the string by syllable length
# without accounting for such a newline, which could yield wrong syllables.
#
# @return [Regexp] the memoized cluster expression
def pinyin_regexp
  return @pinyin_cluster_regexp if @pinyin_cluster_regexp

  any_syllable = Regexp.union(all_syllables)
  consonant_syllable = Regexp.union(consonant_syllables)

  @pinyin_cluster_regexp = /\A
    # Every syllable can appear at the start of a cluster.
    (#{any_syllable})
    # However, only syllables starting with a consonant can follow, as syllables starting with a vowel have to
    # be prefixed with an apostrophe.
    # Since it is common to omit the apostrophe when there is no ambiguity, also allow syllables starting with
    # a vowel after all letters except n and g, and after -ong, since -on does not appear at the end of a valid
    # syllable.
    (#{consonant_syllable}|(?<=[^ng]|[ōóǒòo]ng)#{any_syllable})*
    (r)?
    \z/x
end

#pinyin_separator_regexp ⇒ Object



36
37
38
39
# File 'lib/ting/hanyu_pinyin_parser.rb', line 36

# A regular expression matching one or more consecutive characters that can *not*
# appear in pinyin. The set of pinyin characters is taken from the lowercased letters
# of all_syllables. The expression is built on first use and cached in
# @pinyin_separator_regexp.
#
# @return [Regexp] the memoized separator expression
def pinyin_separator_regexp
  return @pinyin_separator_regexp if @pinyin_separator_regexp

  pinyin_chars = all_syllables.join.downcase.chars.uniq.sort
  @pinyin_separator_regexp = Regexp.new("[^#{pinyin_chars.join}]+")
end