Class: JDict::LibXMLDictionaryIndexer

Inherits:

DictionaryIndexer

Object
DictionaryIndexer
JDict::LibXMLDictionaryIndexer

show all

Defined in: lib/ruby-jdict/indexer/libxml_dictionary_indexer.rb

Instance Attribute Summary

Attributes inherited from DictionaryIndexer

#parts_of_speech

Instance Method Summary collapse

#index(db_transaction, &block) ⇒ Object
#initialize(path) ⇒ LibXMLDictionaryIndexer constructor

A new instance of LibXMLDictionaryIndexer.
#parse_parts_of_speech ⇒ Object

Constructor Details

#initialize(path) ⇒ `LibXMLDictionaryIndexer`

Returns a new instance of LibXMLDictionaryIndexer.



7
8
9

# File 'lib/ruby-jdict/indexer/libxml_dictionary_indexer.rb', line 7

# Creates an indexer for the dictionary file at +path+.
#
# No work is done here beyond forwarding +path+ to the superclass
# constructor.
#
# @param path the value handed on to DictionaryIndexer#initialize
def initialize(path)
  super(path)
end

Instance Method Details

#index(db_transaction, &block) ⇒ `Object`

# File 'lib/ruby-jdict/indexer/libxml_dictionary_indexer.rb', line 11

# Streams the XML dictionary at @path and hands every completed entry to
# +add_entry+.
#
# Kanji, kana, glosses and parts of speech are accumulated while the reader
# walks the document; a Sense is built at each closing sense tag and an
# Entry at each closing entry tag.
#
# @param db_transaction not referenced by this method; kept so the
#   signature stays compatible with callers
# @yield [entries_added, total] called once per node read, if a block is
#   given; +total+ is always 0 here
# @return [Integer] the number of entries passed to +add_entry+
# @raise [RuntimeError] if an entry closes without any kana reading
def index(db_transaction, &block)
  reader = open_reader(@path)

  # whenever there is a reader error, print its block parameters
  XML::Error.set_handler { |*args| p args }

  entry_sequence_num = 0
  kanji = []
  kana = []
  senses = []
  language = nil
  # Glosses are appended with <<, so this has to be an Array (a Hash has
  # no << and raised NoMethodError on the first gloss).
  glosses = []
  parts_of_speech = []

  entries_added = 0

  while reader.read
    yield entries_added, 0 if block_given?

    case reader.node_type

    # start-of-element node
    when XML::Reader::TYPE_ELEMENT
      case reader.name
      when JDict::JMDictConstants::Elements::SEQUENCE
        entry_sequence_num = reader.next_text.to_i

      # TODO: Raise an exception if reader.next_text.empty? inside the when's
      #       JMdict shouldn't have any empty elements, I believe.
      when JDict::JMDictConstants::Elements::KANJI
        text = reader.next_text
        kanji << text unless text.empty?

      when JDict::JMDictConstants::Elements::KANA
        text = reader.next_text
        kana << text unless text.empty?

      when JDict::JMDictConstants::Elements::GLOSS
        # Assume the language of the whole sense is the language
        # of the first gloss (in practice, there is never a gloss
        # with more than one language)
        unless language
          language = reader.node.lang || JDict::JMDictConstants::LANGUAGE_DEFAULT
          language = language.intern
        end
        text = reader.next_text
        glosses << text unless text.empty?

      when JDict::JMDictConstants::Elements::CROSSREFERENCE
        # The cross-reference text is read but not stored.
        # NOTE(review): presumably next_text advances the reader, so the
        # call itself is kept - confirm before removing it.
        reader.next_text
      end

    # XML entity references are treated as a different node type
    # the parent node of the entity reference itself has the actual tag name
    when XML::Reader::TYPE_ENTITY_REFERENCE
      if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
        text = reader.name
        parts_of_speech << text unless text.empty?
      end

    when XML::Reader::TYPE_END_ELEMENT
      case reader.name

      when JDict::JMDictConstants::Elements::SENSE
        senses << Sense.new(parts_of_speech, glosses, language)

        # clear data for the next sense (fresh objects, because the Sense
        # just built still references the old ones)
        glosses = []
        parts_of_speech = []
        language = nil

      # we're at the end of the entry element, so index it
      when JDict::JMDictConstants::Elements::ENTRY
        raise "No kana found for this entry!" if kana.empty?

        entry = Entry.new(entry_sequence_num, kanji, kana, senses)
        add_entry(entry)

        # clear data for the next entry
        kanji, kana, senses = [], [], []

        entries_added += 1
      end
    end
  end

  entries_added
ensure
  # Close the reader even when parsing or add_entry raises.
  reader.close if reader
end

#parse_parts_of_speech ⇒ `Object`

# File 'lib/ruby-jdict/indexer/libxml_dictionary_indexer.rb', line 99

# Collects the entity declarations found in the document's DOCTYPE.
#
# The dictionary at @path is read until its DOCTYPE node, its first
# element, or the end of input, whichever comes first. Every ENTITY_REGEX
# match in the serialized DOCTYPE becomes one hash pair: the first capture
# is converted with +pos_to_sym+ and used as the key, the second capture is
# the value.
#
# @return [Hash] the collected pairs; empty when no DOCTYPE node is seen
#   before the first element or the end of input
def parse_parts_of_speech
  # Open a dedicated reader, as #index does; nothing else in this method
  # provides one.
  reader = open_reader(@path)
  pos_hash = {}

  # Stopping when read returns false keeps this from spinning forever on
  # input that has neither a DOCTYPE nor an element.
  while reader.read
    case reader.node_type
    when XML::Reader::TYPE_DOCUMENT_TYPE
      # The DOCTYPE is serialized and scanned as text; iterating the node's
      # children directly was observed to segfault.
      doctype_string = reader.node.to_s
      doctype_string.scan(ENTITY_REGEX).each do |entity|
        abbrev = entity[0]
        full = entity[1]
        pos_hash[pos_to_sym(abbrev)] = full
      end
      break
    when XML::Reader::TYPE_ELEMENT
      # Reached the document body without meeting a DOCTYPE node.
      break
    end
  end

  printf "\n"

  pos_hash
ensure
  # Close the reader even when scanning raises.
  reader.close if reader
end

Class: JDict::LibXMLDictionaryIndexer

Instance Attribute Summary

Attributes inherited from DictionaryIndexer

Instance Method Summary collapse

Constructor Details

#initialize(path) ⇒ LibXMLDictionaryIndexer

Instance Method Details

#index(db_transaction, &block) ⇒ Object

#parse_parts_of_speech ⇒ Object

#initialize(path) ⇒ `LibXMLDictionaryIndexer`

#index(db_transaction, &block) ⇒ `Object`

#parse_parts_of_speech ⇒ `Object`