Class: Docuvator::Splitter
- Inherits:
-
Object
- Object
- Docuvator::Splitter
- Defined in:
- lib/docuvator/splitter.rb
Constant Summary collapse
- IGNORE_SET =
Sentence ignore set
['Mr', 'Mrs', 'Ms', 'Jr', 'Dr', 'Prof', 'Sr']
- REGEX_NEWLINE =
Regexp.new(/\n+/)
- REGEX_SPACES =
Regexp.new(/\s+/)
- REGEX_PUNCTUATION =
Regexp.new(/[.!?,:;\[\]\(\)]+/)
- REGEX_SENTENCES =
Regexp.new(/(?<=[.!?]|[.!?][\'"])(?<! #{IGNORE_SET.join('\\.| ') + '\\.|'} \s[A-Z]\.)\s+/ix)
Instance Attribute Summary collapse
-
#file ⇒ Object
readonly
Returns the value of attribute file.
-
#paragraphs ⇒ Object
Returns the value of attribute paragraphs.
-
#sentences ⇒ Object
Returns the value of attribute sentences.
-
#word_frequency ⇒ Object
Returns the value of attribute word_frequency.
Instance Method Summary collapse
-
#initialize(file) ⇒ Splitter
constructor
Regex to split up sentences - stackoverflow.com/a/5844564/583592.
- #split ⇒ Object
Constructor Details
#initialize(file) ⇒ Splitter
Regex to split up sentences - stackoverflow.com/a/5844564/583592
(?<=              # Begin positive lookbehind.
  [.!?]           # Either an end of sentence punct,
  | [.!?]['"]     # or end of sentence punct and quote.
)                 # End positive lookbehind.
(?<!              # Begin negative lookbehind.
  Mr\.            # Skip either "Mr."
  | Mrs\.         # or "Mrs.",
  | Ms\.          # or "Ms.",
  | Jr\.          # or "Jr.",
  | Dr\.          # or "Dr.",
  | Prof\.        # or "Prof.",
  | Sr\.          # or "Sr.",
  | \s[A-Z]\.     # or initials ex: "George W. Bush",
)                 # End negative lookbehind.
\s+               # Split on whitespace between sentences.
/ix
35 36 37 38 39 40 41 42 |
# File 'lib/docuvator/splitter.rb', line 35
#
# Remembers the path of the file to be split.
#
# When the path does not exist, an error is logged through Log.error and the
# process terminates with exit status 1.
#
# @param file [String] path to the file that #split will read
def initialize(file)
  # Guard clause: bail out before storing a path that cannot be read later.
  unless File.exist?(file)
    Log.error "Cannot split #{file} as it does not exist"
    exit 1
  end

  @file = file
end
Instance Attribute Details
#file ⇒ Object (readonly)
Returns the value of attribute file.
4 5 6 |
# File 'lib/docuvator/splitter.rb', line 4 def file @file end |
#paragraphs ⇒ Object
Returns the value of attribute paragraphs.
3 4 5 |
# File 'lib/docuvator/splitter.rb', line 3 def paragraphs @paragraphs end |
#sentences ⇒ Object
Returns the value of attribute sentences.
3 4 5 |
# File 'lib/docuvator/splitter.rb', line 3 def sentences @sentences end |
#word_frequency ⇒ Object
Returns the value of attribute word_frequency.
3 4 5 |
# File 'lib/docuvator/splitter.rb', line 3 def word_frequency @word_frequency end |
Instance Method Details
#split ⇒ Object
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/docuvator/splitter.rb', line 44
#
# Reads @file and populates three attributes:
#
# * @paragraphs     - the text split on REGEX_NEWLINE, with every whitespace
#                     run collapsed to one space, stripped, and blank entries
#                     removed.
# * @sentences      - every paragraph split on REGEX_SENTENCES, flattened into
#                     a single array in document order.
# * @word_frequency - Hash (default 0) mapping each downcased word, with
#                     REGEX_PUNCTUATION matches removed, to its occurrence
#                     count. Tokens that are empty after punctuation removal
#                     are not counted.
#
# A summary line is logged through Log.info after each stage.
#
# @return [Object] whatever the final Log.info call returns
def split
  # File.read opens, reads and closes the file in one call, so no handle is
  # left open (File.open(...).read without a block never closes it).
  @paragraphs = File.read(@file)
                    .split(REGEX_NEWLINE)
                    .map { |paragraph| paragraph.gsub(REGEX_SPACES, ' ').strip }
                    .reject(&:empty?) # whitespace-only lines are not paragraphs
  Log.info "Number of paragraphs: #{@paragraphs.size}"

  # Split up paragraphs into sentences
  @sentences = @paragraphs.flat_map { |paragraph| paragraph.split(REGEX_SENTENCES) }
  Log.info "Number of sentences: #{@sentences.size}"

  @word_frequency = Hash.new(0)
  @sentences.each do |sentence|
    sentence.split.each do |word|
      token = word.gsub(REGEX_PUNCTUATION, '').downcase
      # A token made only of punctuation (e.g. "...") is not a word.
      next if token.empty?

      @word_frequency[token] += 1
    end
  end
  Log.info "Unique words: #{@word_frequency.size}"
end