Class: SRX::Polish::SentenceSplitter

Inherits:
Object
  • Object
show all
Includes:
Enumerable
Defined in:
lib/srx/polish/sentence_splitter.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text = nil) ⇒ SentenceSplitter

Returns a new instance of SentenceSplitter.



64
65
66
67
68
69
70
# File 'lib/srx/polish/sentence_splitter.rb', line 64

# Builds a splitter over the given text.
#
# A String argument is wrapped in a read-only UTF-8 StringIO; any other
# value (including the default nil) is stored unchanged as the input.
#
# @param text [String, Object, nil] the text to split, or an already
#   opened stream-like object
def initialize(text=nil)
  @input = case text
           when String then StringIO.new(text, "r:utf-8")
           else text
           end
end

Instance Attribute Details

#debug=(value) ⇒ Object (writeonly)

Sets the attribute debug

Parameters:

  • value

    the value to set the attribute debug to.



62
63
64
# File 'lib/srx/polish/sentence_splitter.rb', line 62

# Stores the debug flag in @debug.
#
# @param value [Object] any truthy value turns debugging on
def debug=(value)
  instance_variable_set(:@debug, value)
end

#input ⇒ Object

Returns the value of attribute input.



61
62
63
# File 'lib/srx/polish/sentence_splitter.rb', line 61

# Returns whatever is currently stored in @input (nil when nothing
# has been assigned).
#
# @return [Object, nil] the stored input
def input
  instance_variable_get(:@input)
end

Instance Method Details

#each {|sentence + after_buffer| ... } ⇒ Object

Yields:

  • (sentence + after_buffer)


72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/srx/polish/sentence_splitter.rb', line 72

# Splits the input into sentences and yields them one by one.
#
# The text is scanned one character at a time. At every position the
# (at most 10) characters already consumed form the "before" window and
# the (at most 10) characters still ahead form the "after" window. The
# first REGEXPS entry whose before-pattern matches the before window and
# whose after-pattern matches the after window decides the position: a
# truthy third element ends the current sentence, a falsy one leaves it
# unbroken. BEFORE_RE is consulted first as a cheap pre-filter.
#
# The scan keeps going after the stream reaches EOF until the look-ahead
# window is drained, so breaks close to the end of the text are detected
# and inputs shorter than the window are handled instead of raising
# EOFError.
#
# When @debug is truthy the windows are printed for every candidate
# position and the matching rule is appended (colored) to the sentence.
#
# NOTE(review): BEFORE_RE, REGEXPS and FIRST_CHAR are defined outside this
# method; FIRST_CHAR is assumed to match exactly the first character of a
# non-empty string - confirm against their definitions.
#
# @yield [sentence] each detected sentence, in order of appearance
# @yieldparam sentence [String]
# @return [Enumerator] when called without a block
# @raise [RuntimeError] if no input was given
def each
  raise "Invalid argument - text is nil" if @input.nil?
  return to_enum(:each) unless block_given?

  buffer_length = 10
  sentence = ""
  before_buffer = ""
  after_buffer = ""
  @input.pos = 0

  # Pre-fill the look-ahead window, stopping early on short inputs.
  buffer_length.times do
    break if @input.eof?
    after_buffer << @input.readchar
  end

  # Run until the look-ahead is drained (not merely until EOF) so that the
  # last characters of the text are checked against the rules as well.
  until after_buffer.empty?
    if BEFORE_RE.match(before_buffer)
      puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}" if @debug

      # Only the first matching rule counts, whatever its verdict.
      rule = REGEXPS.find do |before_re, after_re, _value|
        before_re.match(before_buffer) && after_re.match(after_buffer)
      end

      if rule
        before_re, after_re, value = rule
        if @debug
          color = value ? :red : :green
          sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
        end
        if value
          yield sentence
          # Start a fresh string; the caller may keep the yielded one.
          sentence = ""
        end
      end
    end

    # Slide both windows one character forward.
    before_buffer.slice!(FIRST_CHAR) if before_buffer.size >= buffer_length
    first_char = after_buffer.slice!(FIRST_CHAR)
    before_buffer << first_char
    sentence << first_char
    after_buffer << @input.readchar unless @input.eof?
  end

  yield sentence unless sentence.empty?
end