Module: NlpPure::Segmenting::DefaultSentence

Defined in:
lib/nlp_pure/segmenting/default_sentence.rb

Overview

SEE ALSO: Kiss, Strunk (2006), "Unsupervised Multilingual Sentence Boundary Detection". NOTE: this fails on some proper nouns that contain abbreviations (e.g. business names), and it fails on single-linebreak headings.

Constant Summary collapse

# Default settings for the sentence segmenter; handed out unchanged by `options`.
DEFAULT_OPTIONS =
{
  # punctuation or linebreaks: a run of `.`, `?`, `!`, two or more "\n", or "\r\n".
  # The pattern contains a capture group, so String#split keeps the matched
  # delimiter in its result instead of discarding it.
  # NOTE(review): the group itself is quantified (`(...)+`), so only its last
  # repetition is captured; a run such as "?!" comes back from split as "!".
  # Confirm whether that loss is intended.
  split: /([.?!]|\n{2,}|\r\n)+/,
  # array of arrays; [0] should be regexp, [1] should be replacement
  # NOTE: minor performance risk in letting this array grow long
  gsub:  [
    # period ellipses need reconstruction; for now any run of three or more
    # periods is replaced with the empty string
    [/\.{3,}/, '']
  ],
  # NOTE(review): the two keys below are not read by any method shown on this
  # page; presumably they are consumed elsewhere. Verify before changing them.
  naive_sentence_word_count: 3,
  segment_boundary: '. '
  # NOTE: the freeze below is shallow; the nested :gsub array stays mutable.
}.freeze

Class Method Summary collapse

Class Method Details

.clean_input(text = nil) ⇒ Object



40
41
42
43
44
45
46
47
48
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 40

# Prepares raw text for splitting: applies every configured :gsub
# replacement, then trims surrounding whitespace.
#
# The caller's string is never modified and may be frozen. Replacements are
# applied with the non-destructive String#gsub, because `text.to_s` returns
# the very same object when `text` is already a String.
#
# @param text [#to_s, nil] raw input; nil is treated as an empty string
# @return [String] a new, stripped string with all replacements applied
def clean_input(text = nil)
  # perform replacements to work around the limitations of the splitting regexp
  replaced = options.fetch(:gsub, []).reduce(text.to_s) do |memo, (pattern, replacement)|
    memo.gsub(pattern, replacement)
  end
  # NOTE: leading whitespace is problematic; ref #12
  replaced.strip
end

.cleanup_segmenting(segments) ⇒ Object



89
90
91
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 89

# Drops nil entries from the list of segments.
#
# @param segments [Array] segments, possibly containing nil placeholders
# @return [Array] a new array without the nil entries; the input is untouched
def cleanup_segmenting(segments)
  segments.reject(&:nil?)
end

.handle_special_fragments(segments, segment) ⇒ Object

rejoin leading punctuation, abbreviation, and numbers



62
63
64
65
66
67
68
69
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 62

# Glues the fragments that follow `segment` back onto it for as long as
# next_segment_appears_included? accepts the fragment at the head of
# `segments`.
#
# @param segments [Array<String>] remaining fragments; consumed from the front
# @param segment [String] the fragment currently being assembled
# @return [String] the assembled fragment with surrounding whitespace stripped
def handle_special_fragments(segments, segment)
  merged = segment
  # NOTE: always look at the head of the queue because accepted fragments are shifted off
  upcoming = segments.first
  while next_segment_appears_included?(upcoming)
    STDERR << "\t\t<< #{upcoming.inspect}\n" if ENV['DEBUG']
    merged = "#{merged}#{segments.shift}"
    upcoming = segments.first
  end
  merged.strip
end

.next_segment_appears_included?(segment) ⇒ Boolean

Returns:

  • (Boolean)


71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 71

# Decides whether `segment` still belongs to the fragment before it.
#
# A segment is treated as a continuation when it begins with
#   * a character matched by the :split pattern (leading punctuation),
#   * a digit (leading numeral),
#   * any other word character (assumed abbreviation), or
#   * one whitespace character followed by a lowercase letter or a digit.
#
# All checks are anchored with \A, so only the very start of the segment is
# inspected. `^` is a line anchor in Ruby and would also match at the start of
# any later line inside a multi-line segment.
#
# @param segment [String, nil] the candidate fragment; nil means none is left
# @return [Boolean] true when the fragment should be rejoined
def next_segment_appears_included?(segment)
  return false unless segment
  # NOTE: the reason is tracked separately for logging purposes only
  reason =
    if segment[0] =~ options.fetch(:split, nil)
      # only the first character is tested because the split pattern is unanchored
      'leading punctuation detected'
    elsif segment =~ /\A\d/
      # checked before \w, which would otherwise swallow digits as well
      'leading numeral detected'
    elsif segment =~ /\A\w/
      'assuming abbreviation'
    elsif segment =~ /\A\s[a-z0-9]/
      'greedily grabbing lowercase'
    end
  STDERR << (reason ? "\t! #{reason}\n" : "\t\tx\n") if ENV['DEBUG']
  !reason.nil?
end

.options ⇒ Object

NOTE: exposed as a method for easy mock/stub



25
26
27
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 25

# Returns the option hash used by the segmenter; always DEFAULT_OPTIONS here.
# NOTE: exposed as a method for easy mock/stub
def options
  DEFAULT_OPTIONS
end

.parse(*args) ⇒ Object



29
30
31
32
33
34
35
36
37
38
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 29

# Splits text into sentence segments.
#
# Only the first positional argument is used. It is cleaned with clean_input,
# split naively on the :split pattern, and the resulting fragments are then
# rejoined by rejoin_segment_fragments.
#
# @param args [Array] the text to segment is expected in args[0]
# @return [Array<String>, nil] the segments, or nil when called without arguments
def parse(*args)
  return if args.empty?
  # naive split
  pieces = clean_input(args.first).split(options.fetch(:split, nil))
  # a lone segment has nothing to be rejoined with
  return pieces if pieces.size == 1
  rejoin_segment_fragments(pieces).compact.tap do |sentences|
    STDERR << "#{sentences.inspect}\n" if ENV['DEBUG']
  end
end

.rejoin_segment_fragments(segments) ⇒ Object



50
51
52
53
54
55
56
57
58
59
# File 'lib/nlp_pure/segmenting/default_sentence.rb', line 50

# Walks the naive split result front to back and rebuilds whole segments.
#
# Each fragment taken from the front of `segments` is passed to
# handle_special_fragments, which may consume further fragments from the same
# array. The array is emptied as a side effect; processing stops at the first
# nil or false entry.
#
# @param segments [Array<String>] fragments from the naive split; consumed in place
# @return [Array<String>] the reassembled segments
def rejoin_segment_fragments(segments)
  rebuilt = []
  head = segments.shift
  while head
    STDERR << "#{head.inspect}\n" if ENV['DEBUG']
    # join segments if needed
    rebuilt.push(handle_special_fragments(segments, head))
    head = segments.shift
  end
  rebuilt
end