Class: Furigana::Mecab

Inherits:
Object
  • Object
show all
Defined in:
lib/furigana/mecab.rb

Class Method Summary collapse

Class Method Details

.sanitize_text(text) ⇒ Object



7
8
9
# File 'lib/furigana/mecab.rb', line 7

# Strips every newline character from +text+.
#
# @param text [String] the raw input text
# @return [String] a new string with all "\n" characters removed
def sanitize_text(text)
  text.delete("\n")
end

.tokenize(text) ⇒ Object



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/furigana/mecab.rb', line 11

# Runs `mecab -Ochasen` over +text+ and turns its output into token hashes.
#
# The text is passed through +sanitize_text+ and fed to the command on
# stdin. The command's stdout is split into lines, each line is split on
# tabs, and the first two columns are reported as the surface form and the
# reading. Lines whose first column is the literal 'EOS' are skipped.
#
# @param text [String] the text to tokenize
# @return [Array<Hash>] one hash per output line, with the keys
#   +:surface_form+ and +:reading+ (either may be nil when the column is
#   missing from the line)
def tokenize(text)
  stdout, _stderr, _status = Open3.capture3("mecab -Ochasen", stdin_data: sanitize_text(text))

  # Avoid `ArgumentError - invalid byte sequence in UTF-8`: drop any bytes
  # that are not valid before the output is split.
  unless stdout.valid_encoding?
    stdout.encode!("UTF-8", "UTF-8", invalid: :replace, undef: :replace, replace: "")
  end

  stdout.split("\n").each_with_object([]) do |line, tokens|
    surface, reading = line.split("\t")
    next if surface == 'EOS'

    tokens << { surface_form: surface, reading: reading }
  end
end