11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
# File 'lib/furigana/mecab.rb', line 11
# Tokenizes +text+ by piping it through the external +mecab+ command
# (invoked with +-Ochasen+) and collecting the first two tab-separated
# columns of every output line.
#
# @param text [String] input text; it is passed through +sanitize_text+
#   before being written to mecab's stdin
# @return [Array<Hash>] one hash per token, in output order, with the keys
#   +:surface_form+ (first column) and +:reading+ (second column); the
#   reading is +nil+ when a line has no second column
def tokenize(text)
  surface_form, reading = 0, 1
  # Only stdout is consumed; mecab's stderr and exit status are not inspected.
  stdout, _stderr, _status = Open3.capture3("mecab -Ochasen", :stdin_data => sanitize_text(text))
  # Replace undecodable bytes so the split below cannot raise on invalid UTF-8.
  unless stdout.valid_encoding?
    stdout.encode!("UTF-8", "UTF-8", :invalid => :replace, :undef => :replace, :replace => "�")
  end
  stdout.split("\n").each_with_object([]) do |line, output|
    # The sentence terminator is a line consisting solely of "EOS". A real
    # token whose surface form happens to be "EOS" still carries tab-separated
    # columns and must be kept, so compare the whole line, not just column 0.
    next if line.empty? || line == 'EOS'
    columns = line.split("\t")
    output << {
      :surface_form => columns[surface_form],
      :reading => columns[reading]
    }
  end
end
|