Module: WenlinDbScanner::Dicts

Defined in: lib/wenlin_db_scanner/dict.rb

Overview

Parses the data in the dictionary databases.

Class Method Summary

.en_zh(db_root) ⇒ Enumerator<DictEntry>

The entries in the English->Chinese dictionary.
.entries(db_file) ⇒ Enumerator<DictEntry>

Generic decoder for a database of dictionary entries.
.key_frequency(key) ⇒ Integer?

The frequency information expressed in a dictionary key.
.key_latin_frequency(key) ⇒ Boolean

The latin frequency information expressed in a dictionary key.
.key_latin_term(key) ⇒ String

The term defined by a dictionary key, spelled using Latin characters.
.key_term(key) ⇒ String

The term defined by a dictionary key.
.zh_en(db_root) ⇒ Enumerator<DictEntry>

The entries in the Chinese->English dictionary.

Class Method Details

.en_zh(db_root) ⇒ `Enumerator<DictEntry>`

The entries in the English->Chinese dictionary.

Parameters:

db_root (String) —

the directory containing the .db files

Returns:

(Enumerator<DictEntry>)



11
12
13

# File 'lib/wenlin_db_scanner/dict.rb', line 11

# The entries in the English->Chinese dictionary.
#
# @param [String] db_root the directory containing the .db files
# @return [Enumerator<DictEntry>] whatever {entries} produces for the
#   'yinghan.db' file under db_root
def self.en_zh(db_root)
  dict_path = File.join(db_root, 'yinghan.db')
  entries(dict_path)
end

.entries(db_file) ⇒ `Enumerator<DictEntry>`

Generic decoder for a database of dictionary entries.

Parameters:

db_file (String) —

path to the .db file containing dictionary data

Returns:

(Enumerator<DictEntry>)

# File 'lib/wenlin_db_scanner/dict.rb', line 27

# Generic decoder for a database of dictionary entries.
#
# Every non-binary record is split into stripped, non-blank lines. The first
# line is the entry's key; each following line has the form "<tag> <data>",
# where the tag is an optional run of digits (nesting indexes), a tag name and
# an optional "@..." suffix. The database is only opened when the returned
# enumerator is iterated.
#
# @param [String] db_file path to the .db file containing dictionary data
# @return [Enumerator<DictEntry>] one entry per decodable text record
# @raise [RuntimeError] if a line's tag is malformed or its name is unknown
def self.entries(db_file)
  # The data with every "[...]" span removed.
  outside_brackets = lambda { |text| text.gsub(/\[[^\]]*\]/, '').strip }
  # The contents of every "[...]" span in the data, joined by "; ".
  inside_brackets = lambda do |text|
    text.scan(/\[([^\]]*)\]/).map(&:first).join('; ').strip
  end

  Enumerator.new do |yielder|
    db = Db.new db_file
    db.records.each do |record|
      next if record.binary?
      lines = record.text.split("\n").map(&:strip).reject(&:empty?)
      # A record with no non-blank lines has no key, so nothing can be decoded.
      next if lines.empty?

      key = lines[0]

      entry = DictEntry.new
      entry.key = key
      entry.term = key_term key
      entry.latin_term = key_latin_term key
      entry.term_frequency = key_frequency key
      entry.latin_frequency_boost = key_latin_frequency key

      lines[1..-1].each do |line|
        # Reset for every line: only the b / e / sub / subof tags accumulate
        # their values, and that must not leak into the tags that follow them.
        collect_values = false
        tag, data = *line.split(' ', 2)
        tag_parts = /^(\d*)(\w+)(\@.*)?$/.match tag
        unless tag_parts
          raise "Unknown tag format #{tag} in dictionary entry!\n#{record.text}"
        end

        # Each branch sets either prop (one property, value in data), or
        # prop = nil plus prop1/data1 and prop2/data2 (two properties), or
        # prop = false to skip the line.
        case tag_parts[2]
        when 'ipa'
          prop = :ipa
        when 'a'
          prop = :abbreviates
        when 'c'
          prop = nil
          prop1 = :zh
          data1 = outside_brackets.call(data)
          prop2 = :zh_tw
          data2 = inside_brackets.call(data)
          if data2.empty?
            data2 = data1
          elsif data2.index '-'
            # A '-' in the bracketed form stands for the character at the same
            # position in the unbracketed form.
            data2 = data2.chars.map.with_index { |char, index|
              (char == '-') ? data1[index] : char
            }.join ''
          end
        when 'd'
          prop = :defn
        when 'b'  # NOTE: base of?
          prop = nil
          prop1 = :used_in_terms
          prop2 = :used_in_serials
          data1 = outside_brackets.call(data)
          data2 = inside_brackets.call(data)
          collect_values = true
        when 'e'  # NOTE: equivalent?
          prop = nil
          prop1 = :linked_terms
          prop2 = :linked_serials
          data1 = outside_brackets.call(data)
          data2 = inside_brackets.call(data)
          collect_values = true
        when 'f' # e.g. 2.2 [XHPC:4]
          prop = :freq
          data = data.split('[', 2).first.strip
        when 'gr'
          prop = :grade
        when 'h'
          # NOTE: guessing this means it shows up in the application's help.
          #       it seems to only be set for technical terms
          prop = false
        when 'hz'
          prop = :example_zh
        when 'infl'
          prop = :inflection
        when 'j'  # NOTE: jump?
          prop = :see_serial
        when 'k'
          prop = :see_term
        when 'm'
          prop = :measure_word
          # NOTE: stripping the complex hanzi, as it can be found by
          #       cross-referencing the measure word's key
          data = outside_brackets.call(data).split('/').map(&:strip)
        when 'n'
          # NOTE: the field of reference sometimes looks like "mus.[音]"
          data2 = inside_brackets.call(data)
          if data2.empty?
            prop = :field
          else
            prop = nil
            prop1 = :field
            prop2 = :field_zh
            data1 = outside_brackets.call(data)
          end
        when 'note'
          prop = :note
        when 'o'
          prop = :construction
        when 'p'
          prop = :speech_part
        when 'q'
          prop = :usage
        when 'r', 'rem'
          # NOTE: skipping remarks / revisions for now; they might be
          #       interesting for research
          prop = false
        when 's'
          prop = :serial
        when 'sub'
          prop = nil
          prop1 = :extend
          prop2 = :extend_serial
          data1 = outside_brackets.call(data)
          data2 = inside_brackets.call(data)
          collect_values = true
        when 'subof'
          prop = nil
          prop1 = :extended_from
          prop2 = :extended_from_serial
          data1 = outside_brackets.call(data)
          data2 = inside_brackets.call(data)
          collect_values = true
        when 't'
          prop = :example_translation
        when 'u'
          prop = :unverified
          data = true
        when 'v'
          # NOTE: no idea what this is, only shows up once
          prop = false
        when 'w'
          prop = :reference
        when 'x'
          prop = :example
        when 'y'
          prop = :years
        else
          raise "Unknown tag #{tag} in dictionary entry!\n#{record.text}"
        end
        next if prop == false

        ops = if prop
          [[prop, data]]
        else
          [[prop1, data1], [prop2, data2]]
        end
        ops.each do |k, v|
          if tag_parts[1].empty?
            # No numeric prefix: the value belongs directly to the property.
            if collect_values
              entry[k] ||= []
              entry[k] << v
            else
              entry[k] = v
            end
          else
            # Example: 31x means example: [blah, blah, [value]]
            # Each prefix digit is a 1-based index; '0' stands for 10.
            indexes = tag_parts[1].chars.map do |char|
              ((char == ?0) ? 10 : char.to_i) - 1
            end
            if indexes.any? { |i| i < 0 }
              puts "Broken tag #{tag} #{tag_parts[1]} #{indexes.inspect}\n#{record.text}"
            end
            entry[k] ||= []
            unless entry[k].kind_of?(Array)
              # Fix entries listing props x and 2x instead of 1x, 2x.
              entry[k] = [entry[k]]
            end
            # Walk down the nested arrays, creating levels as needed.
            array = entry[k]
            indexes[0...-1].each do |i|
              array[i] ||= []
              unless array[i].kind_of?(Array)
                # Fix entries listing props 1x and 12x instead of 11x, 12x.
                array[i] = [array[i]]
              end
              array = array[i]
            end
            if collect_values
              array[indexes.last] ||= []
              array[indexes.last] << v
            else
              array[indexes.last] = v
            end
          end
        end
      end

      yielder << entry
    end
  end
end

.key_frequency(key) ⇒ `Integer?`

The frequency information expressed in a dictionary key.

This shows the relative frequency of the term, among all the terms with the same exact spelling. For Chinese terms, the spelling is pinyin.

Parameters:

key (String) —

a dictionary key

Returns:

(Integer, nil) —

nil if the key does not have frequency information

# File 'lib/wenlin_db_scanner/dict.rb', line 242

# The frequency information expressed in a dictionary key.
#
# The frequency is read from the digits (plain or superscript) found in the
# run of non-letter characters at the very start of the key.
#
# @param [String] key a dictionary key
# @return [Integer, nil] nil if the key does not have frequency information,
#   i.e. it starts with a letter or its non-letter prefix holds no digit
def self.key_frequency(key)
  # \A anchors at the start of the string rather than the start of any line.
  prefix = key[/\A[^\p{L}]+/]
  return nil unless prefix

  # A prefix made only of punctuation (e.g. the "-" of a suffix entry) carries
  # no frequency, so it must not be reported as 0.
  digits = prefix.tr('⁰¹²³⁴⁵⁶⁷⁸⁹', '0123456789')[/\d+/]
  digits && digits.to_i
end

.key_latin_frequency(key) ⇒ `Boolean`

The latin frequency information expressed in a dictionary key.

This is true if the term is the most frequent, among all terms with the same latin spelling. For Chinese terms, the latin spelling is pinyin with tone information removed.

Parameters:

key (String) —

a dictionary key

Returns:

(Boolean)



256
257
258

# File 'lib/wenlin_db_scanner/dict.rb', line 256

# The latin frequency information expressed in a dictionary key.
#
# The information is carried by a '*' as the key's last character.
#
# @param [String] key a dictionary key
# @return [Boolean] true if the key ends with '*'
def self.key_latin_frequency(key)
  key.end_with?('*')
end

.key_latin_term(key) ⇒ `String`

The term defined by a dictionary key, spelled using Latin characters.

Parameters:

key (String) —

a dictionary key

Returns:

(String) —

the term inside the key, spelled using Latin characters



231
232
233

# File 'lib/wenlin_db_scanner/dict.rb', line 231

# The term defined by a dictionary key, spelled using Latin characters.
#
# @param [String] key a dictionary key
# @return [String] the result of Chars.pinyin_to_latin applied to the term
#   extracted from the key
def self.key_latin_term(key)
  term = key_term(key)
  Chars.pinyin_to_latin(term)
end

.key_term(key) ⇒ `String`

The term defined by a dictionary key.

Parameters:

key (String) —

a dictionary key

Returns:

(String) —

the term inside the key



223
224
225

# File 'lib/wenlin_db_scanner/dict.rb', line 223

# The term defined by a dictionary key.
#
# @param [String] key a dictionary key
# @return [String] the key with every character that is not a Unicode letter
#   removed
def self.key_term(key)
  non_letters = /\P{L}+/
  key.gsub(non_letters, '')
end

.zh_en(db_root) ⇒ `Enumerator<DictEntry>`

The entries in the Chinese->English dictionary.

Parameters:

db_root (String) —

the directory containing the .db files

Returns:

(Enumerator<DictEntry>)



19
20
21

# File 'lib/wenlin_db_scanner/dict.rb', line 19

# The entries in the Chinese->English dictionary.
#
# @param [String] db_root the directory containing the .db files
# @return [Enumerator<DictEntry>] whatever {entries} produces for the
#   'cidian.db' file under db_root
def self.zh_en(db_root)
  dict_path = File.join(db_root, 'cidian.db')
  entries(dict_path)
end

Module: WenlinDbScanner::Dicts

Overview

Class Method Summary collapse

Class Method Details

.en_zh(db_root) ⇒ Enumerator<DictEntry>

.entries(db_file) ⇒ Enumerator<DictEntry>

.key_frequency(key) ⇒ Integer?

.key_latin_frequency(key) ⇒ Boolean

.key_latin_term(key) ⇒ String

.key_term(key) ⇒ String

.zh_en(db_root) ⇒ Enumerator<DictEntry>

.en_zh(db_root) ⇒ `Enumerator<DictEntry>`

.entries(db_file) ⇒ `Enumerator<DictEntry>`

.key_frequency(key) ⇒ `Integer?`

.key_latin_frequency(key) ⇒ `Boolean`

.key_latin_term(key) ⇒ `String`

.key_term(key) ⇒ `String`

.zh_en(db_root) ⇒ `Enumerator<DictEntry>`