Class: Docuvator::Splitter

Inherits:

Object

Object
Docuvator::Splitter

Defined in: lib/docuvator/splitter.rb

Constant Summary collapse

IGNORE_SET = Sentence ignore set

['Mr', 'Mrs', 'Ms', 'Jr', 'Dr', 'Prof', 'Sr']

REGEX_NEWLINE =

Regexp.new(/\n+/)

REGEX_SPACES =

Regexp.new(/\s+/)

REGEX_PUNCTUATION =

Regexp.new(/[.!?,:;\[\]\(\)]+/)

REGEX_SENTENCES =

Regexp.new(/(?<=[.!?]|[.!?][\'"])(?<! #{IGNORE_SET.join('\\.| ') + '\\.|'} \s[A-Z]\.)\s+/ix)

Instance Attribute Summary collapse

#file ⇒ Object readonly

Returns the value of attribute file.
#paragraphs ⇒ Object

Returns the value of attribute paragraphs.
#sentences ⇒ Object

Returns the value of attribute sentences.
#word_frequency ⇒ Object

Returns the value of attribute word_frequency.

Instance Method Summary collapse

#initialize(file) ⇒ Splitter constructor

Regex to split up sentences - stackoverflow.com/a/5844564/583592.
#split ⇒ Object

Constructor Details

#initialize(file) ⇒ `Splitter`

Regex to split up sentences - stackoverflow.com/a/5844564/583592

(?<= # Begin positive lookbehind.

[.!?]             # Either an end of sentence punct,

| [.!?]['"]       # or end of sentence punct and quote.

)                 # End positive lookbehind.

(?<!              # Begin negative lookbehind.

Mr\.              # Skip either "Mr."

# File 'lib/docuvator/splitter.rb', line 35

# Remembers the path of the file to split.
#
# When the path does not exist, an error is logged through Log.error and the
# process terminates with exit status 1.
#
# @param file [String] path checked with File.exist?
def initialize(file)
  unless File.exist?(file)
    Log.error "Cannot split #{file} as it does not exist"
    exit 1
  end

  @file = file
end

Instance Attribute Details

#file ⇒ `Object` (readonly)

Returns the value of attribute file.



4
5
6

# File 'lib/docuvator/splitter.rb', line 4

# Reader for @file, the path stored by the constructor.
def file
  @file
end

#paragraphs ⇒ `Object`

Returns the value of attribute paragraphs.



3
4
5

# File 'lib/docuvator/splitter.rb', line 3

# Reader for @paragraphs. The value is assigned by #split (an Array of
# whitespace-normalized paragraph strings); the constructor does not set it.
def paragraphs
  @paragraphs
end

#sentences ⇒ `Object`

Returns the value of attribute sentences.



3
4
5

# File 'lib/docuvator/splitter.rb', line 3

# Reader for @sentences. The value is assigned by #split (an Array of
# sentence strings); the constructor does not set it.
def sentences
  @sentences
end

#word_frequency ⇒ `Object`

Returns the value of attribute word_frequency.



3
4
5

# File 'lib/docuvator/splitter.rb', line 3

# Reader for @word_frequency. The value is assigned by #split (a Hash with
# default 0 mapping each word to its count); the constructor does not set it.
def word_frequency
  @word_frequency
end

Instance Method Details

#split ⇒ `Object`

# File 'lib/docuvator/splitter.rb', line 44

# Reads the file at @file and populates @paragraphs, @sentences and
# @word_frequency, logging the size of each through Log.info.
#
# - Paragraphs: the file contents split on REGEX_NEWLINE, with every match of
#   REGEX_SPACES collapsed to a single space and both ends stripped.
# - Sentences: every paragraph split on REGEX_SENTENCES, in document order.
# - Word frequency: each whitespace-separated token of each sentence, with
#   REGEX_PUNCTUATION matches removed and the rest downcased, counted in a
#   Hash whose default value is 0.
#
# @return [Object] whatever the final Log.info call returns
def split
  # File.read closes the handle itself; File.open(...).read left it open
  # until the garbage collector happened to reclaim it.
  @paragraphs = File.read(@file).split(REGEX_NEWLINE).map do |paragraph|
    paragraph.gsub(REGEX_SPACES, ' ').strip
  end
  Log.info "Number of paragraphs: #{@paragraphs.size}"

  @sentences = @paragraphs.flat_map { |paragraph| paragraph.split(REGEX_SENTENCES) }
  Log.info "Number of sentences: #{@sentences.size}"

  @word_frequency = Hash.new(0)
  @sentences.each do |sentence|
    sentence.split.each do |token|
      word = token.gsub(REGEX_PUNCTUATION, '').downcase
      # A token made only of punctuation (e.g. "...") is not a word; counting
      # it would add an empty-string key and inflate the unique-word total.
      next if word.empty?

      @word_frequency[word] += 1
    end
  end
  Log.info "Unique words: #{@word_frequency.size}"
end

Class: Docuvator::Splitter

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file) ⇒ Splitter

Instance Attribute Details

#file ⇒ Object (readonly)

#paragraphs ⇒ Object

#sentences ⇒ Object

#word_frequency ⇒ Object