Class: ContentFocus::HTML
- Inherits:
-
Object
- Object
- ContentFocus::HTML
- Defined in:
- lib/content_focus/html.rb
Overview
Static content fragments are things like: title, about, author, content of an article, etc.
Instance Method Summary collapse
-
#initialize(html) ⇒ HTML
constructor
A new instance of HTML.
-
#static_fragment(options = {}) ⇒ Object
Based on the title, find a common chunk of HTML that is the most relevant. This is to extract atomic/permanent content.
-
#static_fragments(options = {}) ⇒ Object
Get all relevant div/span/td/body/p blocks from the HTML page, based on the <title>. This is to extract atomic/permanent content.
- #static_text ⇒ Object
Constructor Details
#initialize(html) ⇒ HTML
Returns a new instance of HTML.
9 10 11 |
# File 'lib/content_focus/html.rb', line 9

# Builds a new HTML wrapper around the given markup.
#
# @param html [Object] markup handed straight to Hpricot()
#   (presumably an HTML string; confirm against callers)
def initialize(html)
  # Keep the parsed document; the fragment finders query it via @doc.
  @doc = Hpricot(html)
end
Instance Method Details
#static_fragment(options = {}) ⇒ Object
Based on the title, find a common chunk of HTML that is the most relevant. This is to extract atomic/permanent content.
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/content_focus/html.rb', line 20

# Based on the title, find the single most relevant chunk of HTML.
# This is meant to extract atomic/permanent content.
#
# Starts from the candidates returned by #static_fragments. When several
# candidates share the same parent, a combined fragment for that parent
# (with the summed score) is added to the pool before picking the winner.
#
# @param options [Hash] forwarded unchanged to #static_fragments
# @return [Hash, nil] fragment hash (:score, :element, :inner_text, :parent, ...)
#   or nil when there are no candidates
def static_fragment(options = {})
  fragments = static_fragments(options)
  return nil if fragments.nil? || fragments.empty?
  return fragments.first if fragments.size == 1

  # Find common ancestors: fragments without a recorded parent cannot be combined.
  siblings_by_parent = fragments.select { |f| f[:parent] }.group_by { |f| f[:parent] }

  # Find the top parent, i.e. the one holding the most fragments.
  top_fragments = siblings_by_parent.values.max_by { |group| group.size } || []

  # Failed? Fall back to the first candidate as returned by #static_fragments.
  return fragments.first if top_fragments.empty?

  # Create a combined fragment with combined score.
  container = top_fragments.first[:element].parent
  combined_fragment = {
    :score => top_fragments.inject(0) { |sum, f| sum + f[:score] },
    :element => container,
    :inner_text => container.inner_text,
    # NOTE(review): this records the container's own object_id, whereas the
    # other fragments record the id of their element's parent. Kept as-is;
    # confirm whether the container's parent was intended.
    :parent => container ? container.object_id : nil
  }

  # De-value the body tag: nearly everything lives under it, so only count
  # the number of grouped fragments instead of their summed score.
  combined_fragment[:score] = top_fragments.size if container.name == 'body'

  # Add combined fragment to pool and re-order by score (highest first).
  fragments << combined_fragment
  fragments.sort! { |b, a| a[:score] <=> b[:score] }
  fragments.first
end
#static_fragments(options = {}) ⇒ Object
Get all relevant div/span/td/body/p blocks from the HTML page, based on the <title>. This is to extract atomic/permanent content.
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
# File 'lib/content_focus/html.rb', line 73

# Get all relevant div/span/td/body/p/dd/ul blocks from the HTML page, scored
# against the keywords of the page <title>. This is meant to extract
# atomic/permanent content.
#
# Two passes feed one pool:
# 1. "small" blocks, judged on their own direct text nodes only;
# 2. "big" blocks, judged on their full inner text, but only when the
#    element carries a positive identifier.
#
# @param options [Hash] currently unused
# @return [Array<Hash>] fragments (:score, :element, :inner_text, :parent,
#   :identifier) ordered by descending score; empty when the page has no
#   <title> or nothing matched
def static_fragments(options = {})
  title_elements = (@doc/"title")
  # Without a title there are no keywords to score against.
  return [] if !title_elements || title_elements.empty?

  title_inner_text = title_elements.first.inner_text
  # NOTE(review): reproduced as listed; the method name after `Tagger.` may
  # have been lost from the listing - confirm against the original file.
  keywords = Linguistics::Tagger.(title_inner_text)

  # Number of title keywords that occur as whole words in +text+.
  count_matches = lambda do |text|
    words = text.split(/\s+/)
    keywords.count { |k| words.include?(k) }
  end

  blocks = []

  # First, find the smallest blocks, but bigger than the title.
  (@doc/"div|span|td|body|p|dd|ul").each do |element|
    next if element_with_negative_identifier(element)

    # Only the element's own text nodes count here, not nested markup.
    # The `|| []` guards against a nil children list.
    inner_text = ''
    (element.children || []).each do |child|
      inner_text << child.to_s if child.is_a?(Hpricot::Text)
    end
    inner_text.downcase!
    next if inner_text.size <= title_inner_text.size

    # Check the occurrence of keywords in the block, skip if none.
    num_matches = count_matches.call(inner_text)
    next if num_matches == 0

    # Score: keyword matches, doubled for a positively named id/class.
    identifier = element_with_positive_identifier(element)
    score = identifier ? num_matches * 2 : num_matches

    blocks << {:score => score, :element => element, :inner_text => inner_text,
               :parent => element.parent ? element.parent.object_id : nil,
               :identifier => identifier}
  end

  # Finding big blocks with both matches and positive identifiers.
  big_block_identifiers = Hash.new(0)
  (@doc/"div|span|table|td|body|p|dd|ul").each do |element|
    next if element_with_negative_identifier(element)

    identifier = element_with_positive_identifier(element)
    next unless identifier
    # Identifier statistics are logged before the size/keyword filters.
    big_block_identifiers[identifier] += 1

    inner_text = element.inner_text.downcase
    next if inner_text.size <= title_inner_text.size

    # Check the occurrence of keywords in the block, skip if none.
    num_matches = count_matches.call(inner_text)
    next if num_matches == 0

    # Parent is stored as an object_id, same as for the small blocks, so that
    # fragments under one parent share a single grouping key.
    blocks << {:score => num_matches * 3, :element => element, :inner_text => inner_text,
               :parent => element.parent ? element.parent.object_id : nil,
               :identifier => identifier}
  end

  # De-value the identifiers that are repeated (integer division, may hit 0).
  blocks.each do |block|
    repeated = block[:identifier] && big_block_identifiers[block[:identifier]] > 1
    block[:score] = block[:score] / 3 if repeated
  end

  # Order those blocks by top matches and drop the ones de-valued to zero.
  blocks.sort! { |b, a| a[:score] <=> b[:score] }
  blocks.reject! { |b| b[:score] == 0 }
  blocks
end
#static_text ⇒ Object
13 14 15 16 |
# File 'lib/content_focus/html.rb', line 13

# Plain text of the most relevant fragment.
#
# @return [String, nil] the fragment element's inner text with surrounding
#   whitespace removed, or nil when #static_fragment finds nothing
def static_text
  fragment = static_fragment
  # `strip`, not `strip!`: the bang form returns nil when there is nothing
  # to strip, which would turn already-clean text into nil.
  fragment ? fragment[:element].inner_text.strip : nil
end