Class: SRX::English::SentenceSplitter
- Inherits:
-
Object
- Object
- SRX::English::SentenceSplitter
- Includes:
- Enumerable
- Defined in:
- lib/srx/english/sentence_splitter.rb
Instance Attribute Summary collapse
-
#debug ⇒ Object
writeonly
Sets the attribute debug.
-
#input ⇒ Object
Returns the value of attribute input.
Instance Method Summary collapse
-
#each {|sentence + after_buffer| ... } ⇒ Object
Iterate over the sentences in the text.
-
#initialize(text = nil) ⇒ SentenceSplitter
constructor
The sentence splitter is initialized with the
text
to split.
Constructor Details
#initialize(text = nil) ⇒ SentenceSplitter
The sentence splitter is initialized with the text
to split. This might be a String or an IO object.
41 42 43 44 45 46 47 |
# File 'lib/srx/english/sentence_splitter.rb', line 41
#
# Build a splitter around the text to be split.
#
# text - a String, which is wrapped in a StringIO opened with mode
#        "r:utf-8", or any other object (nil included), which is stored
#        in @input unchanged.
def initialize(text=nil)
  @input = text.is_a?(String) ? StringIO.new(text, "r:utf-8") : text
end
Instance Attribute Details
#debug=(value) ⇒ Object (writeonly)
Sets the attribute debug
37 38 39 |
# Writer: stores value in @debug. When @debug is truthy, #each prints its two buffers and appends the matched rule to the sentence text.
# File 'lib/srx/english/sentence_splitter.rb', line 37 def debug=(value) @debug = value end |
#input ⇒ Object
Returns the value of attribute input.
36 37 38 |
# Reader: returns @input, which the constructor sets to a StringIO wrapping a given String, or to the passed object unchanged.
# File 'lib/srx/english/sentence_splitter.rb', line 36 def input @input end |
Instance Method Details
#each {|sentence + after_buffer| ... } ⇒ Object
Iterate over the sentences in the text. If the text is nil, an exception is raised.
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/srx/english/sentence_splitter.rb', line 51
#
# Iterate over the sentences in the text, yielding each one as a String.
#
# The input is rewound and then consumed one character at a time through
# two sliding windows of at most 10 characters each: before_buffer holds
# the text just consumed, after_buffer holds the look-ahead. Whenever
# BEFORE_RE matches before_buffer, the REGEXPS rules are tried in order and
# the first rule whose two patterns match both windows decides the outcome:
# a truthy rule value ends the current sentence (it is yielded), a falsy
# value leaves the sentence running.
#
# Whatever is left once the input is exhausted (the unfinished sentence
# plus the remaining look-ahead) is yielded as the last sentence. This
# includes texts that fit entirely inside the look-ahead window, which
# would otherwise never be yielded at all.
#
# When @debug is truthy the two windows are printed for every candidate
# position and the matched rule is appended, colorized, to the sentence.
#
# Raises RuntimeError if the input is nil.
# Returns an Enumerator when called without a block.
def each
  raise "Invalid argument - text is nil" if @input.nil?
  return enum_for(:each) unless block_given?

  buffer_length = 10
  sentence = ""
  before_buffer = ""
  @input.pos = 0
  # Prime the look-ahead window; getc returns nil at EOF and join drops nils.
  after_buffer = Array.new(buffer_length) { @input.getc }.join

  until @input.eof?
    if BEFORE_RE.match(before_buffer)
      puts "#{before_buffer}|#{after_buffer.gsub(/\n/, "\\n")}" if @debug

      # Only the first rule matching both windows is applied.
      rule = REGEXPS.find do |before_re, after_re, _value|
        before_re.match(before_buffer) && after_re.match(after_buffer)
      end

      if rule
        before_re, after_re, value = rule
        if @debug
          color = value ? :red : :green
          sentence << Term::ANSIColor.send(color, "<#{before_re}:#{after_re}>")
        end
        if value
          yield sentence
          sentence = ""
        end
      end
    end

    # Slide both windows forward by one character.
    next_after = @input.readchar
    before_buffer.sub!(FIRST_CHAR, "") if before_buffer.size >= buffer_length
    after_buffer.sub!(FIRST_CHAR, "")
    # The text just removed from the look-ahead moves into the consumed side.
    shifted = Regexp.last_match(0)
    before_buffer << shifted
    sentence << shifted
    after_buffer << next_after
  end

  remainder = sentence + after_buffer
  yield remainder unless remainder.empty?
end