Class: Boilerpipe::Filters::ExpandTitleToContentFilter
- Inherits:
  - Object
    - Boilerpipe::Filters::ExpandTitleToContentFilter
- Defined in:
- lib/boilerpipe/filters/expand_title_to_content_filter.rb
Class Method Summary
Class Method Details
.no_title_with_subsequent_content?(content_start, title) ⇒ Boolean
38 39 40 |
# File 'lib/boilerpipe/filters/expand_title_to_content_filter.rb', line 38
#
# Reports whether there is no title followed by a later content block,
# i.e. whether either index is missing or the content start does not come
# strictly after the title.
#
# @param content_start [Integer, nil] index of the first content block, or nil if none
# @param title [Integer, nil] index of the title block, or nil if none
# @return [Boolean] true when either argument is nil or content_start <= title
def self.no_title_with_subsequent_content?(content_start, title)
  title.nil? || content_start.nil? || content_start <= title
end
.process(doc) ⇒ Object
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/boilerpipe/filters/expand_title_to_content_filter.rb', line 10
#
# Promotes blocks labelled :MIGHT_BE_CONTENT to content when they sit between
# the title block and the first content block. The range starts at the title
# block itself and stops just before the first content block.
#
# Text blocks are modified in place via `content=`.
#
# @param doc [#text_blocks] document whose text blocks are scanned; the
#   collection is presumably an Array (it must support each_with_index and
#   slice with a range) -- TODO confirm against the document class
# @return [Object] the same doc that was passed in
def self.process(doc)
  tbs = doc.text_blocks

  title = nil
  content_start = nil
  tbs.each_with_index do |tb, i|
    # Keep overwriting so title ends up as the last :TITLE block seen at or
    # before the first content block.
    title = i if tb.has_label?(:TITLE)
    next unless tb.is_content?

    # Nothing after the first content block can affect either index.
    content_start = i
    break
  end

  return doc if no_title_with_subsequent_content?(content_start, title)

  tbs.slice(title...content_start).each do |tb|
    tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
  end

  doc
end