Class: Cab2xml::Converter

Inherits:
Object
  • Object
show all
Defined in:
lib/cab2xml/converter.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Converter

Returns a new instance of Converter.



7
8
9
10
# File 'lib/cab2xml/converter.rb', line 7

# Sets up a converter with its defaults: attribute mode :attr and
# token format :mecab_unidic.
def initialize
  @attr_mode, @token_format = :attr, :mecab_unidic
end

Instance Attribute Details

#attr_mode ⇒ Object

Returns the value of attribute attr_mode.



6
7
8
# File 'lib/cab2xml/converter.rb', line 6

# Reader for @attr_mode. The constructor sets it to :attr, and
# parse_extended_tag branches on the values :node and :attr when it
# handles an ATTR tag.
#
# @return [Object] the current value of @attr_mode
def attr_mode
  @attr_mode
end

#token_format ⇒ Object

Returns the value of attribute token_format.



6
7
8
# File 'lib/cab2xml/converter.rb', line 6

# Reader for @token_format. The constructor sets it to :mecab_unidic, and
# parse_cabocha_tag branches on the values :chasen and :mecab_unidic when
# it handles a token line.
#
# @return [Object] the current value of @token_format
def token_format
  @token_format
end

Instance Method Details

#add_node(parent, format, data) ⇒ Object



36
37
38
39
40
# File 'lib/cab2xml/converter.rb', line 36

# Renders +format+ with +data+ (via String#%), appends the result to
# +parent+ with <<, and remembers the parent's last child in @last.
#
# @param parent [#<<, #children] node that receives the new markup
# @param format [String] format string describing the markup
# @param data [Object, Array] value(s) substituted into +format+
# @return [Object] the last child of +parent+ after the append (also @last)
def add_node(parent, format, data)
  markup = format % data
  parent << markup
  @last = parent.children.last
end

#check_namespace(key) ⇒ Object



41
42
43
44
45
46
47
48
# File 'lib/cab2xml/converter.rb', line 41

# Declares the namespace for a prefixed key ("prefix:name") on the current
# document node (@doc), once per document.
#
# The set of already-declared prefixes is tied to the @doc it was built
# for. When @doc changes (a new DOC tag, or a later parse on the same
# converter) the set starts over, so every document element that carries
# prefixed attributes gets its own declaration.
#
# NOTE(review): @doc must be set when a prefixed key arrives; a nil @doc
# raises NoMethodError here, as it did before.
#
# @param key [String, nil] attribute key, possibly of the form "prefix:name"
# @return [Object, nil] nil when +key+ has no prefix or the prefix is
#   already declared on the current document; otherwise whatever
#   @doc.add_namespace returns
def check_namespace(key)
  return unless key =~ /:/
  namespace = key.split(':', 2).first
  # Start a fresh prefix set whenever the target document node has changed.
  unless @namespaces && @namespace_owner.equal?(@doc)
    @namespaces = {}
    @namespace_owner = @doc
  end
  return if @namespaces[namespace]
  @namespaces[namespace] = true
  @doc.add_namespace namespace, 'http://www.ninjal.ac.jp/corpus_center/bccwj/' + namespace
end

#create_xml(mode) ⇒ Object



29
30
31
32
33
34
35
# File 'lib/cab2xml/converter.rb', line 29

# Creates a new XML document whose root element is named after +mode+ and
# stores it in @xml with its encoding set to UTF-8. The root element is
# also stored in @corpora (mode :corpora) or @doc (mode :document).
#
# @param mode [Symbol] :corpora or :document
# @return [Object, nil] the root element for :document, otherwise nil
def create_xml(mode)
  root_tag = "<#{mode}/>"
  @xml = Nokogiri::XML(root_tag)
  @xml.encoding = 'UTF-8'
  root = @xml.root
  @corpora = root if mode == :corpora
  @doc = root if mode == :document
end

#parse(file) ⇒ Object



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/cab2xml/converter.rb', line 11

# Reads CaboCha-style annotated text from +file+ line by line and builds
# the XML document.
#
# Every piece of per-parse state is cleared first, so a converter that is
# reused for several inputs behaves exactly like a fresh one (previously
# only @xml, @sen and @senid were reset, leaving e.g. @namespaces and
# @last pointing at the previous document).
#
# Line dispatch after stripping the line terminator:
# * empty lines and lines starting with "##" are skipped,
# * lines starting with "#!" go to parse_extended_tag,
# * everything else goes to parse_cabocha_tag.
#
# @param file [#set_encoding, #each_line] readable input; switched to UTF-8
# @return [Object, nil] the document stored in @xml, or nil if no line
#   caused one to be created
def parse(file)
  @xml = @corpora = @doc = @sen = @chunk = @tok = @last = nil
  @senid = @tokid = nil
  @namespaces = nil
  file.set_encoding 'UTF-8'
  file.each_line do |line|
    line.chomp!
    # Blank lines and "##" comment lines carry no data.
    next if line.empty? || line.start_with?('##')

    if line.start_with?('#!')
      parse_extended_tag line
    else
      parse_cabocha_tag line
    end
  end
  @xml
end

#parse_cabocha_tag(line) ⇒ Object



96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/cab2xml/converter.rb', line 96

# Handles one non-comment CaboCha line: a chunk header, the "EOS" sentence
# terminator, or a token line.
#
# * Chunk header ("* <id> <link><rel> <head>/<func> <score>"): creates the
#   document if needed, opens a new sentence when none is open, and appends
#   a <chunk> to it. Only lines with the numeric header shape count, so a
#   token whose surface form is "*" is still treated as a token.
# * "EOS": closes the current sentence.
# * Anything else: appended as a <tok> to the current chunk, with fields
#   read according to token_format (:chasen or :mecab_unidic). Other
#   token_format values produce nothing.
#
# All string values are XML-escaped (& < > ") before being placed into the
# markup, so tokens containing those characters no longer corrupt the
# generated fragment.
#
# NOTE(review): the attribute name "cype" in the <tok> formats looks like a
# typo for "ctype"; it is kept as-is because renaming it changes the output
# schema. Confirm with consumers of the XML.
#
# @param line [String] one input line without its line terminator
# @return [Object, nil] result of the branch that ran (callers ignore it)
def parse_cabocha_tag(line)
  entities = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;' }
  # Integers (ids) and nil (missing fields) pass through untouched.
  escape = lambda do |value|
    value.is_a?(String) ? value.gsub(/[&<>"]/, entities) : value
  end

  case line
  when /^\*\s+-?\d+\s+-?\d+\S/
    create_xml(:document) unless @xml
    unless @sen
      @sen = add_node(@doc, '<sentence id="%d"/>', @senid ||= 0)
      @senid += 1
      @tokid = 0
    end
    _, id, dep, headfunc, score = line.split(' ')
    # dep is "<link><rel>", e.g. "-1D": the last character is the relation.
    link, rel = dep[0..-2], dep[-1]
    head, func = headfunc.split('/')
    data = [id, link, rel, head, func, score].map(&escape)
    format = '<chunk id="%d" link="%d" rel="%s" head="%d" func="%d" score="%s"/>'
    @chunk = add_node(@sen, format, data)
  when 'EOS'
    @sen = nil
  else
    case token_format
    when :chasen
      data = line.split(/\s/)
      # Fields 1..5 become attributes; field 0 (surface form) is the text.
      data = [@tokid, *data[1..5], data[0]].map(&escape)
      format = '<tok id="%d" read="%s" base="%s" pos="%s" cype="%s" cform="%s">%s</tok>'
      @tok = add_node(@chunk, format, data)
      @tokid += 1
    when :mecab_unidic
      text, data = line.split(/\s/, 2)
      # "*" marks an unset feature; it is rendered as an empty value.
      data = data.split(',').map { |item| item == '*' ? nil : item }
      pos = data[0, 4].compact.join('-')
      ctype, cform, lemmaForm, lemma = data[4, 4]
      data = [@tokid, pos, ctype, cform, lemmaForm, lemma, text].map(&escape)
      format = '<tok id="%d" pos="%s" cype="%s" cform="%s" lemmaForm="%s" lemma="%s">%s</tok>'
      @tok = add_node(@chunk, format, data)
      @tokid += 1
    end
  end
end

#parse_extended_tag(line) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/cab2xml/converter.rb', line 49

# Handles one extended-tag line ("#! LABEL field ..."). The line is split
# as space-separated CSV (so fields may be double-quoted); the first field
# is discarded, the second is the label, and the rest are its data.
#
# Depending on the label, a node is appended to the corpora root, the
# current document, or the current sentence; for ATTR, either an <ATTR>
# child is added to the most recently added node (@attr_mode :node) or an
# attribute is set on it (@attr_mode :attr). Unknown labels are ignored.
#
# Fixes relative to the earlier version:
# * DOCID / SENTENCETAGID now emit a quoted id attribute (id="..."), since
#   unquoted attribute values are not well-formed XML.
# * Empty fields (nil from the CSV parser) are treated as empty strings
#   instead of raising NoMethodError.
# * Double quotes are escaped in values placed into markup, so a quote in
#   e.g. a Comments field cannot terminate the attribute early.
#
# NOTE(review): in :attr mode the value handed to @last[key]= is still
# passed through encode(:xml => :text) as before; whether the receiver
# escapes again on output is not visible here — confirm.
#
# @param line [String] an input line starting with "#!"
# @return [Object, nil] result of the branch that ran (callers ignore it)
def parse_extended_tag(line)
  _, label, *data = CSV.parse_line(line, :col_sep => "\s")
  data.map! { |item| item.to_s.encode(:xml => :text) }
  # Values interpolated into markup additionally need their quotes escaped.
  markup = data.map { |item| item.gsub('"', '&quot;') }
  case label
  when 'DOCID'
    create_xml(:corpora) unless @xml
    format = '<DOCID id="%d">%s</DOCID>'
    @docid = add_node(@corpora, format, markup)
  when 'SENTENCETAGID'
    format = '<SENTENCETAGID id="%d">%s</SENTENCETAGID>'
    @sentencetagid = add_node(@corpora, format, markup)
  when 'DOC'
    format = '<document id="%d"/>'
    @doc = add_node(@corpora, format, markup)
    # Sentence numbering restarts for each document.
    @senid = 0
  when 'ATTR'
    case @attr_mode
    when :node
      format = '<ATTR Key="%s" Value="%s"/>'
      @attr = add_node(@last, format, markup)
    when :attr
      key, value = data
      check_namespace key
      @last[key] = value
    end
  when 'SEGMENT'
    format = '<SEGMENT TagName="%s" StartGPos="%s" EndGPos="%s" Comments="%s"/>'
    @seg = add_node(@doc, format, markup)
  when 'SEGMENT_S'
    format = '<SEGMENT_S TagName="%s" StartLPos="%s" EndLPos="%s" Comments="%s"/>'
    @seg = add_node(@sen, format, markup)
  when 'LINK'
    format = '<LINK TagName="%s" FromSegNo="%s" EndSegNo="%s" Comments="%s"/>'
    @link = add_node(@doc, format, markup)
  when 'LINK_S'
    format = '<LINK_S TagName="%s" FromSegSNo="%s" EndSegSNo="%s" Comments="%s"/>'
    @link = add_node(@sen, format, markup)
  when 'GROUP'
    format = '<GROUP TagName="%s" SegNo="%s" Comments="%s"/>'
    # First field is the tag name, last is the comment, the rest are members.
    fields = [markup[0], markup[1..-2].join(','), markup[-1]]
    @group = add_node(@doc, format, fields)
  when 'GROUP_S'
    format = '<GROUP_S TagName="%s" SegSNo="%s" Comments="%s"/>'
    fields = [markup[0], markup[1..-2].join(','), markup[-1]]
    @group = add_node(@sen, format, fields)
  end
end