Class: Boilerpipe::Filters::MinClauseWordsFilter

Inherits:
Object
  • Object
show all
Defined in:
lib/boilerpipe/filters/min_clause_words_filter.rb

Class Method Summary collapse

Class Method Details

.is_clause?(text, min_words = 5) ⇒ Boolean

Returns:

  • (Boolean)


30
31
32
33
34
# File 'lib/boilerpipe/filters/min_clause_words_filter.rb', line 30

# Decides whether +text+ is long enough to be treated as a clause.
#
# Length is measured by counting runs of spaces, newlines and carriage
# returns in +text+; the text qualifies when that count reaches +min_words+.
#
# @param text [String, nil] candidate clause text
# @param min_words [Integer] minimum number of whitespace runs required
# @return [Boolean] false for nil, otherwise whether the threshold is met
def self.is_clause?(text, min_words=5)
  if text.nil?
    false
  else
    whitespace_runs = text.scan(/[ \n\r]+/)
    whitespace_runs.length >= min_words
  end
end

.process(doc, min_words = 5) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/boilerpipe/filters/min_clause_words_filter.rb', line 12

# Marks content text blocks as non-content when none of their
# delimiter-terminated segments is long enough to count as a clause.
#
# For every block in +doc.text_blocks+ that is currently content, the block
# text is scanned for segments made of letters, digits, spaces or
# non-breaking spaces followed by punctuation (, . : ; ! ?) and then
# whitespace or end of line. Each segment is tested with +is_clause?+
# using the caller's +min_words+.
#
# @param doc [#text_blocks] document whose blocks are updated in place
# @param min_words [Integer] threshold forwarded to +is_clause?+
# @return [Object] the same +doc+ that was passed in
def self.process(doc, min_words = 5)
  clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/

  doc.text_blocks.each do |tb|
    next if tb.is_not_content?

    candidates = tb.text.scan(clause_delimiter)

    # NOTE(review): blocks with no delimiter-terminated segment are left
    # untouched, matching the previous behaviour of this method — confirm
    # whether such blocks should instead be dropped.
    next if candidates.empty?

    # Decide once per block: a single qualifying clause anywhere keeps the
    # block, regardless of shorter segments that appear before it.
    has_clause = candidates.any? { |candidate| is_clause?(candidate, min_words) }
    tb.content = false unless has_clause
  end

  doc
end