Class: Cab2xml::Converter
- Inherits:
-
Object
- Object
- Cab2xml::Converter
- Defined in:
- lib/cab2xml/converter.rb
Instance Attribute Summary collapse
-
#attr_mode ⇒ Object
Returns the value of attribute attr_mode.
-
#token_format ⇒ Object
Returns the value of attribute token_format.
Instance Method Summary collapse
- #add_node(parent, format, data) ⇒ Object
- #check_namespace(key) ⇒ Object
- #create_xml(mode) ⇒ Object
-
#initialize ⇒ Converter
constructor
A new instance of Converter.
- #parse(file) ⇒ Object
- #parse_cabocha_tag(line) ⇒ Object
- #parse_extended_tag(line) ⇒ Object
Constructor Details
#initialize ⇒ Converter
Returns a new instance of Converter.
7 8 9 10 |
# File 'lib/cab2xml/converter.rb', line 7
# Builds a converter with its default settings: attributes are handled in
# :attr mode and tokens are read in the :mecab_unidic layout.
def initialize
  @attr_mode, @token_format = :attr, :mecab_unidic
end
Instance Attribute Details
#attr_mode ⇒ Object
Returns the value of attribute attr_mode.
6 7 8 |
# File 'lib/cab2xml/converter.rb', line 6
# Reader for the attribute mode.
#
# @return [Object] the current value of @attr_mode
def attr_mode
  @attr_mode
end
#token_format ⇒ Object
Returns the value of attribute token_format.
6 7 8 |
# File 'lib/cab2xml/converter.rb', line 6
# Reader for the token format.
#
# @return [Object] the current value of @token_format
def token_format
  @token_format
end
Instance Method Details
#add_node(parent, format, data) ⇒ Object
36 37 38 39 40 |
# File 'lib/cab2xml/converter.rb', line 36
# Renders +format+ with +data+ and appends the result to +parent+.
#
# @param parent [Object] node receiving the markup; must respond to << and children
# @param format [String] format string describing the markup
# @param data [Object] value, or array of values, interpolated into +format+
# @return [Object] the last child of +parent+ after the append (also stored in @last)
def add_node(parent, format, data)
  markup = format % data
  parent << markup
  # Remember the newest child so that later calls can refer to it via @last.
  @last = parent.children.last
end
#check_namespace(key) ⇒ Object
41 42 43 44 45 46 47 48 |
# File 'lib/cab2xml/converter.rb', line 41
# Declares the namespace for a prefixed key ("prefix:name") on @doc, once per
# prefix. Keys without a colon are ignored.
#
# @param key [String] attribute key, optionally carrying a "prefix:" part
# @return [Object, nil] the result of add_namespace when a prefix is newly
#   registered, nil otherwise
def check_namespace(key)
  return if key !~ /:/

  prefix = key.split(':', 2).first
  registry = (@namespaces ||= {})
  return if registry[prefix]

  # Mark the prefix as seen before declaring it, so it is declared only once.
  registry[prefix] = true
  @doc.add_namespace(prefix, 'http://www.ninjal.ac.jp/corpus_center/bccwj/' + prefix)
end
#create_xml(mode) ⇒ Object
29 30 31 32 33 34 35 |
# File 'lib/cab2xml/converter.rb', line 29
# Starts a fresh XML document whose root element is named after +mode+ and
# stores it in @xml with UTF-8 encoding.
#
# @param mode [Symbol] :corpora or :document; the root element is remembered in
#   @corpora or @doc respectively
# @return [Object, nil] the root element for :document, nil otherwise
def create_xml(mode)
  document = @xml = Nokogiri::XML("<#{mode}/>")
  document.encoding = 'UTF-8'
  @corpora = document.root if mode == :corpora
  mode == :document ? (@doc = document.root) : nil
end
#parse(file) ⇒ Object
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
# File 'lib/cab2xml/converter.rb', line 11
# Reads +file+ line by line and dispatches every line to the matching handler.
#
# Blank lines and lines starting with "##" are skipped, lines starting with
# "#!" go to parse_extended_tag, and everything else goes to parse_cabocha_tag.
#
# @param file [IO] readable stream; must respond to set_encoding and each_line
# @return [Object, nil] the value of @xml after the last line, or nil when no
#   handler created a document
def parse(file)
  # Start from a clean slate. Besides @xml/@sen/@senid, the handlers also keep
  # @corpora, @doc, @chunk, @tok, @tokid, @last and the @namespaces registry;
  # leaving those set would make a reused converter attach nodes to the
  # previous document and skip namespace declarations on the new one.
  @xml = @corpora = @doc = @sen = @chunk = @tok = @last = nil
  @senid = @tokid = nil
  @namespaces = nil

  file.set_encoding 'UTF-8'
  file.each_line do |raw|
    line = raw.chomp
    case line
    when '', /^##/
      # blank lines and "##" comment lines carry no data
    when /^#!/
      parse_extended_tag line
    else
      parse_cabocha_tag line
    end
  end
  @xml
end
#parse_cabocha_tag(line) ⇒ Object
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# File 'lib/cab2xml/converter.rb', line 96
# Handles one regular (non-extended) line of the input.
#
# * A line starting with "*" is a chunk header of the form
#   "* id <link><rel> head/func score". It opens a new sentence when none is
#   open and appends a <chunk> element to it.
# * "EOS" closes the current sentence.
# * Any other line is a token, appended as <tok> to the current chunk. The
#   field layout depends on token_format (:chasen or :mecab_unidic); other
#   formats are ignored.
#
# All values taken from the line are XML-escaped before they are interpolated
# into markup, so tokens such as "&", "<" or a double quote cannot break the
# generated fragment.
#
# @param line [String] one input line without its line terminator
# @return [Object] the value of the last expression evaluated for the line
def parse_cabocha_tag(line)
  # Escapes markup characters in strings; Integers and nil pass through so
  # "%d" and "%s" keep formatting them as before.
  escape = lambda do |value|
    value.is_a?(String) ? value.encode(:xml => :text).gsub('"', '&quot;') : value
  end

  case line
  when /^\*/
    create_xml(:document) unless @xml
    unless @sen
      @sen = add_node(@doc, '<sentence id="%d"/>', @senid ||= 0)
      @senid += 1
      @tokid = 0
    end
    _marker, id, dep, headfunc, score = line.split(' ')
    # The dependency field is the link target followed by a one-character
    # relation, e.g. "-1D".
    link, rel = dep[0..-2], dep[-1]
    head, func = headfunc.split('/')
    data = [id, link, rel, head, func, score].map(&escape)
    format = '<chunk id="%d" link="%d" rel="%s" head="%d" func="%d" score="%s"/>'
    @chunk = add_node(@sen, format, data)
  when 'EOS'
    @sen = nil
  else
    # NOTE(review): the attribute name "cype" below looks like a typo for
    # "ctype"; it is kept because it is part of the emitted XML.
    case token_format
    when :chasen
      surface, *features = line.split(/\s/)
      # split drops trailing empty fields; pad to the five attributes the
      # format expects so such tokens do not raise ArgumentError.
      features = Array.new(5) { |index| features[index] }
      data = [@tokid, *features, surface].map(&escape)
      format = '<tok id="%d" read="%s" base="%s" pos="%s" cype="%s" cform="%s">%s</tok>'
      @tok = add_node(@chunk, format, data)
      @tokid += 1
    when :mecab_unidic
      text, features = line.split(/\s/, 2)
      # "*" marks an empty feature; a missing feature column yields no features.
      features = features.to_s.split(',').map { |item| item == '*' ? nil : item }
      pos = features[0, 4].compact.join('-')
      ctype, cform, lemma_form, lemma = features[4, 4]
      data = [@tokid, pos, ctype, cform, lemma_form, lemma, text].map(&escape)
      format = '<tok id="%d" pos="%s" cype="%s" cform="%s" lemmaForm="%s" lemma="%s">%s</tok>'
      @tok = add_node(@chunk, format, data)
      @tokid += 1
    end
  end
end
#parse_extended_tag(line) ⇒ Object
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/cab2xml/converter.rb', line 49
# Handles one extended-tag line ("#! LABEL field ..."). The line is split as
# space-separated CSV: the first field is the "#!" marker, the second the
# label, and the rest the label's data.
#
# Depending on the label, a node is appended to the corpora root, the current
# document or the current sentence, or (for ATTR in :attr mode) an attribute
# is set on the most recently added node. Unknown labels are ignored.
#
# @param line [String] one input line starting with "#!"
# @return [Object] the value of the last expression evaluated for the label
def parse_extended_tag(line)
  _marker, label, *data = CSV.parse_line(line, :col_sep => "\s")
  # CSV yields nil for empty unquoted fields; leave those as nil ("%s" renders
  # them as an empty string) instead of failing on nil.encode.
  data.map! { |item| item && item.encode(:xml => :text) }
  # Values placed inside markup additionally need double quotes escaped,
  # otherwise a quote in e.g. a Comments field ends the attribute early.
  markup = data.map { |item| item && item.gsub('"', '&quot;') }
  # GROUP labels carry a variable number of segment numbers between the tag
  # name and the comment; they are merged into one comma-separated value.
  grouped = lambda do |values|
    [values[0], (values[1..-2] || []).join(','), values[-1]]
  end

  case label
  when 'DOCID'
    create_xml(:corpora) unless @xml
    # The id value must be quoted to be well-formed XML.
    @docid = add_node(@corpora, '<DOCID id="%d">%s</DOCID>', markup)
  when 'SENTENCETAGID'
    @sentencetagid = add_node(@corpora, '<SENTENCETAGID id="%d">%s</SENTENCETAGID>', markup)
  when 'DOC'
    @doc = add_node(@corpora, '<document id="%d"/>', markup)
    @senid = 0
  when 'ATTR'
    case @attr_mode
    when :node
      @attr = add_node(@last, '<ATTR Key="%s" Value="%s"/>', markup)
    when :attr
      # NOTE(review): the value was already entity-escaped above; confirm the
      # attribute setter of @last does not escape it a second time.
      key, value = data
      check_namespace key
      @last[key] = value
    end
  when 'SEGMENT'
    format = '<SEGMENT TagName="%s" StartGPos="%s" EndGPos="%s" Comments="%s"/>'
    @seg = add_node(@doc, format, markup)
  when 'SEGMENT_S'
    format = '<SEGMENT_S TagName="%s" StartLPos="%s" EndLPos="%s" Comments="%s"/>'
    @seg = add_node(@sen, format, markup)
  when 'LINK'
    format = '<LINK TagName="%s" FromSegNo="%s" EndSegNo="%s" Comments="%s"/>'
    @link = add_node(@doc, format, markup)
  when 'LINK_S'
    format = '<LINK_S TagName="%s" FromSegSNo="%s" EndSegSNo="%s" Comments="%s"/>'
    @link = add_node(@sen, format, markup)
  when 'GROUP'
    format = '<GROUP TagName="%s" SegNo="%s" Comments="%s"/>'
    @group = add_node(@doc, format, grouped.call(markup))
  when 'GROUP_S'
    format = '<GROUP_S TagName="%s" SegSNo="%s" Comments="%s"/>'
    @group = add_node(@sen, format, grouped.call(markup))
  end
end