Class: Docuvator::Splitter

Inherits:
Object
  • Object
show all
Defined in:
lib/docuvator/splitter.rb

Constant Summary collapse

IGNORE_SET =

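Abbreviations whose trailing period must not be treated as the end of a sentence (used in the negative lookbehind of REGEX_SENTENCES)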

['Mr', 'Mrs', 'Ms', 'Jr', 'Dr', 'Prof', 'Sr']
REGEX_NEWLINE =
Regexp.new(/\n+/)
REGEX_SPACES =
Regexp.new(/\s+/)
REGEX_PUNCTUATION =
Regexp.new(/[.!?,:;\[\]\(\)]+/)
REGEX_SENTENCES =
Regexp.new(/(?<=[.!?]|[.!?][\'"])(?<! #{IGNORE_SET.join('\\.| ') + '\\.|'} \s[A-Z]\.)\s+/ix)

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file) ⇒ Splitter

Regex to split up sentences - stackoverflow.com/a/5844564/583592

/
(?<=                  # Begin positive lookbehind.
  [.!?]               # Either an end of sentence punct,
| [.!?]['"]           # or end of sentence punct and quote.
)                     # End positive lookbehind.
(?<!                  # Begin negative lookbehind.
  Mr\.                # Skip either "Mr."
| Mrs\.               # or "Mrs.",
| Ms\.                # or "Ms.",
| Jr\.                # or "Jr.",
| Dr\.                # or "Dr.",
| Prof\.              # or "Prof.",
| Sr\.                # or "Sr.",
| \s[A-Z]\.           # or initials ex: "George W. Bush",
)                     # End negative lookbehind.
\s+                   # Split on whitespace between sentences.
/ix



35
36
37
38
39
40
41
42
# File 'lib/docuvator/splitter.rb', line 35

# Remembers the path of the file to be split.
#
# A path that does not exist is reported through Log.error and the process
# is then terminated with exit status 1, so @file is only ever assigned a
# path that File.exist? accepted at construction time.
#
# @param file [String] path of the text file to split
def initialize(file)
  unless File.exist?(file)
    Log.error "Cannot split #{file} as it does not exist"
    exit 1
  end

  @file = file
end

Instance Attribute Details

#fileObject (readonly)

Returns the value of attribute file.



4
5
6
# File 'lib/docuvator/splitter.rb', line 4

# Returns the path held in @file. #initialize assigns it only after
# File.exist? has accepted the path.
def file
  @file
end

#paragraphsObject

Returns the value of attribute paragraphs.



3
4
5
# File 'lib/docuvator/splitter.rb', line 3

# Returns @paragraphs, the array #split builds by cutting the file's text on
# REGEX_NEWLINE and normalising whitespace inside each piece.
def paragraphs
  @paragraphs
end

#sentencesObject

Returns the value of attribute sentences.



3
4
5
# File 'lib/docuvator/splitter.rb', line 3

# Returns @sentences, the flat array #split builds by splitting every
# paragraph on REGEX_SENTENCES.
def sentences
  @sentences
end

#word_frequencyObject

Returns the value of attribute word_frequency.



3
4
5
# File 'lib/docuvator/splitter.rb', line 3

# Returns @word_frequency, the Hash (default value 0) that #split fills with
# occurrence counts keyed by downcased tokens stripped of REGEX_PUNCTUATION
# characters.
def word_frequency
  @word_frequency
end

Instance Method Details

#splitObject



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/docuvator/splitter.rb', line 44

# Reads the file at @file and derives three views of its text, logging the
# size of each through Log.info:
#
# * @paragraphs     - newline-separated chunks with whitespace runs collapsed
#                     to a single space and surrounding whitespace stripped;
#                     chunks that end up empty are dropped.
# * @sentences      - every paragraph split on REGEX_SENTENCES, flattened.
# * @word_frequency - Hash (default 0) counting each downcased token after
#                     REGEX_PUNCTUATION characters are removed; tokens that
#                     consist only of punctuation are not counted.
#
# @return whatever the final Log.info call returns
def split
  # File.read closes the handle itself; File.open(...).read without a block
  # left the descriptor open until garbage collection.
  @paragraphs = File.read(@file)
                    .split(REGEX_NEWLINE)
                    .map { |paragraph| paragraph.gsub(REGEX_SPACES, ' ').strip }
                    .reject(&:empty?) # whitespace-only lines / a leading newline are not paragraphs
  Log.info "Number of paragraphs: #{@paragraphs.size}"

  @sentences = @paragraphs.flat_map { |paragraph| paragraph.split(REGEX_SENTENCES) }
  Log.info "Number of sentences: #{@sentences.size}"

  @word_frequency = Hash.new(0)
  @sentences.each do |sentence|
    sentence.split.each do |token|
      word = token.gsub(REGEX_PUNCTUATION, '').downcase
      # A token such as "..." is reduced to "" and is not a word.
      next if word.empty?

      @word_frequency[word] += 1
    end
  end
  Log.info "Unique words: #{@word_frequency.size}"
end