Class: Boilerpipe::Filters::LargeBlockSameTagLevelToContentFilter

Inherits:
Object
  • Object
show all
Defined in:
lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb

Class Method Summary collapse

Class Method Details

.process(doc) ⇒ Object



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb', line 11

# Marks long non-content text blocks as content when they share a tag level
# with the first block that is already content and is labelled
# :VERY_LIKELY_CONTENT.
#
# The document is modified in place (via +content=+ on its text blocks) and
# handed back unchanged when no such reference block exists.
#
# @param doc [#text_blocks] document whose text blocks are inspected
# @return [Object] the same +doc+ that was passed in
def self.process(doc)
  # Reference block: the first one (in document order) that is content and
  # carries the :VERY_LIKELY_CONTENT label.
  anchor = doc.text_blocks.find do |block|
    block.is_content? && block.has_label?(:VERY_LIKELY_CONTENT)
  end

  # Nothing to compare against, so leave every block as it is.
  return doc unless anchor

  anchor_level = anchor.tag_level

  doc.text_blocks.each do |block|
    # Blocks that are already content are never touched.
    unless block.is_content?
      # Only blocks with at least 100 words at the anchor's tag level qualify.
      promote = block.num_words >= 100 && block.tag_level == anchor_level
      block.content = true if promote
    end
  end

  doc
end