Class: SRX::English::SentenceSplitter

Inherits:
Object
  • Object
show all
Includes:
Enumerable
Defined in:
lib/srx/english/sentence_splitter.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text = nil) ⇒ SentenceSplitter

The sentence splitter is initialized with the text to split. This may be a String or an IO object.



41
42
43
44
45
46
47
# File 'lib/srx/english/sentence_splitter.rb', line 41

# Creates a splitter for the given text.
#
# A String is wrapped in a read-only StringIO opened with the "r:utf-8"
# mode; any other object (including the default nil) is stored unchanged.
#
# @param text [String, Object, nil] the text to split, or an object to read it from
def initialize(text=nil)
  @input =
    case text
    when String then StringIO.new(text, "r:utf-8")
    else text
    end
end

Instance Attribute Details

#debug=(value) ⇒ Object (writeonly)

Sets the attribute debug

Parameters:

  • value

    the value to set the attribute debug to.



37
38
39
# File 'lib/srx/english/sentence_splitter.rb', line 37

# Stores the debug flag in @debug.
#
# When the flag is truthy, #each prints the scan buffers to standard output
# and appends a colored marker for every matched rule to the sentence text.
#
# @param value [Object] the value to store; only its truthiness is used by #each
def debug=(value)
  @debug = value
end

#input ⇒ Object

Returns the value of attribute input.



36
37
38
# File 'lib/srx/english/sentence_splitter.rb', line 36

# Returns the input source set up by the constructor: a StringIO wrapping
# the String that was passed in, or the passed object itself (nil by default).
def input
  @input
end

Instance Method Details

#each {|sentence + after_buffer| ... } ⇒ Object

Iterates over the sentences in the text. If the text is nil, an exception is raised.

Yields:

  • (sentence + after_buffer)


51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/srx/english/sentence_splitter.rb', line 51

# Iterates over the sentences of the input, yielding each one as a String.
#
# The input is rewound and scanned one character at a time through a sliding
# window: +before_buffer+ holds up to +buffer_length+ characters that precede
# the current position and +after_buffer+ holds the characters that follow it.
# Whenever BEFORE_RE matches the text before the position, the rules in
# REGEXPS are tried in order; the first rule whose two patterns match both
# buffers decides whether the position is a sentence break (truthy rule
# value) or an explicit non-break (falsy rule value).
#
# Scanning stops as soon as the input is exhausted, so the final
# +buffer_length+ characters are appended to the last sentence without being
# checked for breaks.
#
# When @debug is truthy the buffers are printed for every candidate position
# and a colored "<before:after>" marker is appended to the sentence text for
# the rule that matched.
#
# @yield [sentence] each sentence, including any leading whitespace
# @yieldparam sentence [String]
# @return [Enumerator] if no block is given
# @raise [RuntimeError] if the input is nil
def each
  raise "Invalid argument - text is nil" if @input.nil?
  return enum_for(:each) unless block_given?

  buffer_length = 10
  sentence = ""
  before_buffer = ""
  @input.pos = 0
  # Prime the look-ahead window; getc returns nil at EOF and join renders nil as "".
  after_buffer = Array.new(buffer_length) { @input.getc }.join

  until @input.eof?
    if BEFORE_RE.match(before_buffer)
      puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}" if @debug
      REGEXPS.each do |before_re, after_re, value|
        next unless before_re.match(before_buffer) && after_re.match(after_buffer)

        if @debug
          color = value ? :red : :green
          sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
        end
        if value
          yield sentence
          sentence = ""
        end
        # Only the first matching rule applies to a position.
        break
      end
    end

    # Advance the window by one character: drop the oldest character of the
    # look-behind buffer once it is full, move the head of the look-ahead
    # buffer into the look-behind buffer and the sentence, then refill.
    next_after = @input.readchar
    before_buffer.slice!(FIRST_CHAR) if before_buffer.size >= buffer_length
    shifted = after_buffer.slice!(FIRST_CHAR)
    before_buffer << shifted
    sentence << shifted
    after_buffer << next_after
  end

  # Flush whatever is left. Inputs no longer than buffer_length never enter
  # the loop, so the whole text is still sitting in after_buffer here.
  remainder = sentence + after_buffer
  yield remainder unless remainder.empty?
end