Module: NlpPure::Segmenting::DefaultSentence
- Defined in:
- lib/nlp_pure/segmenting/default_sentence.rb
Overview
SEE ALSO: Kiss and Strunk (2006), "Unsupervised Multilingual Sentence Boundary Detection".
NOTE: this segmenter fails on some proper nouns that contain abbreviations (e.g. business names), and it also fails on single-linebreak headings.
Constant Summary collapse
- DEFAULT_OPTIONS =
{
  # punctuation or linebreaks
  # NOTE: the capture group makes String#split keep the matched delimiter as
  # its own array element.
  split: /([.?!]|\n{2,}|\r\n)+/,
  # array of arrays; [0] should be regexp, [1] should be replacement
  # NOTE: minor performance risk in letting this array grow long
  gsub: [
    # period ellipses need reconstruction
    [/\.{3,}/, '…']
  ],
  # NOTE(review): the next two keys are not read by any method shown on this
  # page; confirm their consumers before changing them.
  naive_sentence_word_count: 3,
  segment_boundary: '. '
}.freeze
Class Method Summary collapse
- .clean_input(text = nil) ⇒ Object
- .cleanup_segmenting(segments) ⇒ Object
-
.handle_special_fragments(segments, segment) ⇒ Object
rejoin leading punctuation, abbreviation, and numbers.
- .next_segment_appears_included?(segment) ⇒ Boolean
-
.options ⇒ Object
NOTE: exposed as a method for easy mock/stub.
- .parse(*args) ⇒ Object
- .rejoin_segment_fragments(segments) ⇒ Object
Class Method Details
.clean_input(text = nil) ⇒ Object
40 41 42 43 44 45 46 47 48 |
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 40

# Normalizes raw input before it is split into segments.
#
# @param text [#to_s, nil] raw text; nil (or no argument) becomes an empty string
# @return [String] a new string with every `options[:gsub]` pair applied in
#   order and leading/trailing whitespace removed
def clean_input(text = nil)
  # perform replacements to work around the limitations of the splitting regexp.
  # Non-destructive gsub is deliberate: `text.to_s` hands back the caller's own
  # object when text is already a String (and may be frozen), so gsub! would
  # either mutate the caller's data or raise FrozenError.
  cleaned = options.fetch(:gsub, []).reduce(text.to_s) do |memo, (pattern, replacement)|
    memo.gsub(pattern, replacement)
  end
  # NOTE: leading whitespace is problematic; ref #12
  cleaned.strip
end
.cleanup_segmenting(segments) ⇒ Object
89 90 91 |
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 89

# Removes nil entries from a list of segments.
#
# @param segments [Array<String, nil>] segments, possibly containing nils
# @return [Array<String>] a new array without nil entries; the argument is
#   left untouched
def cleanup_segmenting(segments)
  segments.compact
end
.handle_special_fragments(segments, segment) ⇒ Object
rejoin leading punctuation, abbreviation, and numbers
62 63 64 65 66 67 68 69 |
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 62

# Rejoins leading punctuation, abbreviations, and numbers onto +segment+.
#
# Consumes entries from the front of +segments+ for as long as
# next_segment_appears_included? accepts the next one, appending each to the
# segment being built.
#
# @param segments [Array<String>] remaining fragments; shifted (mutated) in place
# @param segment [String] the fragment currently being assembled
# @return [String] the assembled segment with surrounding whitespace stripped
def handle_special_fragments(segments, segment)
  # NOTE: always index zero because we're shifting
  while next_segment_appears_included?(segments[0])
    STDERR << "\t\t<< #{segments[0].inspect}\n" if ENV['DEBUG']
    segment = "#{segment}#{segments.shift}"
  end
  segment.strip
end
.next_segment_appears_included?(segment) ⇒ Boolean
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 71

# Decides whether +segment+ looks like a continuation of the previous
# fragment rather than the start of a new sentence.
#
# @param segment [String, nil] the next unconsumed fragment, or nil when none remain
# @return [Boolean] true when the fragment starts with split punctuation, a
#   digit, a word character, or whitespace followed by a lowercase letter/digit
def next_segment_appears_included?(segment)
  return false unless segment
  leading = segment[0]
  # NOTE: the logic is expanded for logging reasons (despite style violation)
  # All patterns are anchored with \A (start of string): `^` also matches after
  # any newline, which let a later line of a multi-line segment trigger a
  # "leading" rule.
  if leading =~ options.fetch(:split, nil)
    STDERR << "\t! leading punctuation detected\n" if ENV['DEBUG']
  elsif leading =~ /\A\d/
    # checked before \w, which also matches digits and would shadow this branch
    STDERR << "\t! leading numeral detected\n" if ENV['DEBUG']
  elsif leading =~ /\A\w/
    STDERR << "\t! assuming abbreviation\n" if ENV['DEBUG']
  elsif segment =~ /\A\s[a-z0-9]/
    STDERR << "\t! greedily grabbing lowercase\n" if ENV['DEBUG']
  else
    STDERR << "\t\tx\n" if ENV['DEBUG']
    return false
  end
  true
end
.options ⇒ Object
NOTE: exposed as a method for easy mock/stub
25 26 27 |
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 25

# Returns the segmenter configuration.
# NOTE: exposed as a method for easy mock/stub
#
# @return [Hash] the DEFAULT_OPTIONS constant itself (not a copy)
def options
  DEFAULT_OPTIONS
end
.parse(*args) ⇒ Object
29 30 31 32 33 34 35 36 37 38 |
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 29

# Splits text into sentence segments.
#
# Only the first positional argument is used; any others are ignored.
#
# @param args [Array] the text to segment as the first element
# @return [Array<String>, nil] the segments, or nil when called with no arguments
def parse(*args)
  # a splat is always an Array, so emptiness is the only "no input" case
  return nil if args.empty?
  # naive split
  segments = clean_input(args.first).split(options.fetch(:split, nil))
  # skip rejoin if one segment
  return segments if segments.length == 1
  returning = rejoin_segment_fragments(segments).compact
  STDERR << "#{returning.inspect}\n" if ENV['DEBUG']
  returning
end
.rejoin_segment_fragments(segments) ⇒ Object
50 51 52 53 54 55 56 57 58 59 |
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 50

# Reassembles the fragments produced by the naive split.
#
# Fragments are taken from the front of +segments+ one at a time; each is
# handed to handle_special_fragments, which may consume further entries.
#
# @param segments [Array<String>] fragments in document order; emptied in place
# @return [Array<String>] the reassembled segments
def rejoin_segment_fragments(segments)
  reassociated_segments = []
  # take all segments
  while (segment = segments.shift)
    STDERR << "#{segment.inspect}\n" if ENV['DEBUG']
    # join segments if needed
    reassociated_segments << handle_special_fragments(segments, segment)
  end
  reassociated_segments
end