Class: JDict::LibXMLDictionaryIndexer
- Inherits: DictionaryIndexer
  - Object
    - DictionaryIndexer
      - JDict::LibXMLDictionaryIndexer
- Defined in: lib/ruby-jdict/indexer/libxml_dictionary_indexer.rb
Instance Attribute Summary
Attributes inherited from DictionaryIndexer
Instance Method Summary
- #index(db_transaction, &block) ⇒ Object
- #initialize(path) ⇒ LibXMLDictionaryIndexer (constructor): Returns a new instance of LibXMLDictionaryIndexer.
- #parse_parts_of_speech ⇒ Object
Constructor Details
#initialize(path) ⇒ LibXMLDictionaryIndexer
Returns a new instance of LibXMLDictionaryIndexer.
7 8 9 |
# File 'lib/ruby-jdict/indexer/libxml_dictionary_indexer.rb', line 7
#
# Builds a new indexer by handing +path+ straight to the parent
# class constructor; this subclass adds no setup of its own.
#
# @param path the value forwarded unchanged to the superclass constructor
def initialize(path)
  super(path)
end
Instance Method Details
#index(db_transaction, &block) ⇒ Object
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/ruby-jdict/indexer/libxml_dictionary_indexer.rb', line 11
#
# Reads the XML opened from @path node by node, collects the pieces of
# each entry (sequence number, kanji, kana, senses) and passes every
# completed entry to +add_entry+.
#
# @param db_transaction not referenced inside this method body
# @yield [entries_added, total] called after every node that is read;
#   the second argument is always 0
# @return [Integer] the number of entries passed to +add_entry+
# @raise [RuntimeError] when an entry ends without any kana reading
def index(db_transaction, &block)
  reader = open_reader(@path)

  # whenever there is a reader error, print its block parameters
  XML::Error.set_handler { |*args| p args }

  entry_sequence_num, kanji, kana, senses = 0, [], [], []
  language = nil
  # An Array, because glosses are appended with <<, which Hash does not define.
  glosses = []
  parts_of_speech = []
  entries_added = 0

  begin
    while reader.read
      yield entries_added, 0 if block_given?

      case reader.node_type
      # start-of-element node
      when XML::Reader::TYPE_ELEMENT
        case reader.name
        when JDict::JMDictConstants::Elements::SEQUENCE
          entry_sequence_num = reader.next_text.to_i

        # TODO: Raise an exception if reader.next_text.empty? inside the when's
        #       JMdict shouldn't have any empty elements, I believe.
        when JDict::JMDictConstants::Elements::KANJI
          text = reader.next_text
          kanji << text unless text.empty?

        when JDict::JMDictConstants::Elements::KANA
          text = reader.next_text
          kana << text unless text.empty?

        when JDict::JMDictConstants::Elements::GLOSS
          # Assume the language of the whole sense is the language
          # of the first gloss (in practice, there is never a gloss
          # with more than one language)
          language ||= (reader.node.lang || JMDictConstants::LANGUAGE_DEFAULT).intern
          text = reader.next_text
          glosses << text unless text.empty?

        when JDict::JMDictConstants::Elements::CROSSREFERENCE
          # Cross-reference text is not stored. The call is kept because
          # next_text is defined outside this method and presumably advances
          # the reader -- TODO confirm.
          reader.next_text
        end

      # XML entity references are treated as a different node type
      # the parent node of the entity reference itself has the actual tag name
      when XML::Reader::TYPE_ENTITY_REFERENCE
        if reader.node.parent.name == JDict::JMDictConstants::Elements::PART_OF_SPEECH
          text = reader.name
          parts_of_speech << text unless text.empty?
        end

      when XML::Reader::TYPE_END_ELEMENT
        case reader.name
        when JDict::JMDictConstants::Elements::SENSE
          senses << Sense.new(parts_of_speech, glosses, language)

          # clear data for the next sense (fresh objects, so the Sense just
          # built keeps its own collections)
          glosses = []
          parts_of_speech = []
          language = nil

        # we're at the end of the entry element, so index it
        when JDict::JMDictConstants::Elements::ENTRY
          raise "No kana found for this entry!" if kana.empty?

          entry = Entry.new(entry_sequence_num, kanji, kana, senses)
          add_entry(entry)

          # clear data for the next entry
          kanji, kana, senses = [], [], []
          entries_added += 1
        end
      end
    end
  ensure
    # Close the reader even when indexing aborts with an exception.
    reader.close
  end

  entries_added
end
#parse_parts_of_speech ⇒ Object
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
# File 'lib/ruby-jdict/indexer/libxml_dictionary_indexer.rb', line 99
#
# Scans the start of the document for its DOCTYPE node and builds a
# lookup from the entity declarations matched by ENTITY_REGEX.
# Reading stops at the DOCTYPE node, at the first element node, or when
# the reader has no more nodes, whichever comes first.
#
# NOTE(review): +reader+ is not assigned anywhere in this method; it is
# presumably provided by the parent class -- confirm.
#
# @return [Hash] maps pos_to_sym(first capture) to the second capture of
#   each ENTITY_REGEX match; empty when no DOCTYPE node was reached
def parse_parts_of_speech
  pos_hash = {}

  begin
    # Looping on the result of #read ends the scan when input is exhausted
    # instead of spinning forever waiting for a DOCTYPE or element node.
    while reader.read
      case reader.node_type
      when XML::Reader::TYPE_DOCUMENT_TYPE
        # segfaults when attempting this:
        # cs.each do |child|
        #   p child.to_s
        # end
        doctype_string = reader.node.to_s
        doctype_string.scan(ENTITY_REGEX).each do |entity|
          abbrev = entity[0]
          full = entity[1]
          pos_hash[pos_to_sym(abbrev)] = full
        end
        break
      when XML::Reader::TYPE_ELEMENT
        # The DOCTYPE precedes the root element, so nothing is left to find.
        break
      end
    end
  ensure
    # Close the reader even when scanning aborts with an exception.
    reader.close
  end

  printf "\n"

  pos_hash
end