Class: Pismo::Reader::Tree
Constant Summary
Constants inherited from Base
Base::BAD_WORDS, Base::BLOCK_OUTPUT_ELEMENTS, Base::COULD_CONTAIN_FULL_CONTENT, Base::FATAL_WORDS, Base::GOOD_WORDS, Base::INLINE_OUTPUT_ELEMENTS, Base::META_WORDS, Base::NON_HEADER_ELEMENTS, Base::OK_ATTRIBUTES, Base::OK_CLEAN_ATTRIBUTES, Base::OK_ELEMENTS, Base::OUTPUT_ELEMENTS, Base::WONT_CONTAIN_FULL_CONTENT
Instance Attribute Summary
Attributes inherited from Base
#content_candidates, #doc, #options, #raw_content
Instance Method Summary collapse
- #analyze ⇒ Object
Analyze the structure of the HTML document and score branches for likelihood of containing useful content.
- #content_at(index) ⇒ Object
Methods inherited from Base
#build_doc, #content, #images, #initialize, #sentences, #strip
Constructor Details
This class inherits a constructor from Pismo::Reader::Base
Instance Method Details
#analyze ⇒ Object
Analyze the structure of the HTML document and score branches for likelihood of containing useful content
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
# File 'lib/pismo/reader/tree.rb', line 6

# Analyze the structure of the HTML document and score branches for
# likelihood of containing useful content.
#
# Each element matched by the COULD_CONTAIN_FULL_CONTENT selectors that holds
# at least 80 characters of text is turned into a "branch" hash (ids, word
# counts, child counts, score, ...) and stored in @tree under the element's
# path. The (up to) five highest scoring [path, branch] pairs are then stored
# in @content_candidates, best first.
#
# Returns the array assigned to @content_candidates.
def analyze
  @tree = {}

  # Per-node cache keyed by node path, so text statistics and id/class words
  # are computed once even when a node is traversed under several branches.
  node_cache = {}

  # Lower-cased alphabetic words found in a node's id and class attributes.
  id_class_words = lambda do |node|
    "#{node['id']} #{node['class']}".downcase.scan(/[a-z]+/)
  end

  # Text directly inside links and headings does not count towards the
  # "good" word count.
  uncounted_parents = %w[a h1 h2 h3 h4 h5 h6]

  @doc.css(COULD_CONTAIN_FULL_CONTENT.join(", ")).each do |el|
    # Assume that no content we'll want comes in a total package of fewer than 80 characters!
    next unless el.text.to_s.strip.length >= 80

    # Depth is the number of path segments below the first two (e.g. html/body).
    depth = (el.path.scan(/[a-z]+/)[2..-1] || []).length

    local_ids = id_class_words.call(el)
    ids = local_ids

    # Collect id/class words from (depth - 1) ancestors as well. `+=` builds a
    # new array, so local_ids is left untouched.
    ancestor = el.parent
    (depth - 1).times do
      ids += id_class_words.call(ancestor)
      ancestor = ancestor.parent
    end

    branch = {}
    branch[:ids] = ids
    branch[:local_ids] = local_ids
    branch[:score] = -(BAD_WORDS & ids).size
    branch[:score] += (GOOD_WORDS & ids).size * 2
    next if branch[:score] < -5

    # Elements that have an ID or class are more likely to be our winners
    branch[:score] += 2 unless local_ids.empty?

    branch[:name] = el.name
    branch[:depth] = depth
    branch[:path] = el.path

    branch[:raw_word_count] = 0
    branch[:word_count] = 0
    branch[:child_count] = 0
    branch[:bad_child_count] = 0
    branch[:score_steps] = []

    el.traverse do |subel|
      info = (node_cache[subel.path] ||= {})
      info[:is_text] ||= subel.text?

      if info[:is_text]
        info[:text] ||= subel.text.downcase.scan(/[a-z]+/)
        next if info[:text].empty?

        info[:raw_word_count] ||= info[:text].size
        info[:word_count] ||=
          if uncounted_parents.include?(subel.parent.name)
            0
          else
            info[:text].select { |word| word.length > 3 }.size
          end
        info[:meta_matches] ||= (info[:text] & META_WORDS).size

        branch[:raw_word_count] += info[:raw_word_count]
        branch[:word_count] += info[:word_count] - info[:meta_matches]
      end

      # Downcase BEFORE splitting on non-letters so mixed-case ids/classes
      # (e.g. "SideBar") are matched against the word lists intact.
      info[:ids] ||= id_class_words.call(subel)
      info[:bad_child_count_inc] = (BAD_WORDS & info[:ids]).size - (GOOD_WORDS & info[:ids]).size
      info[:child_count_inc] = info[:ids].empty? ? 0 : 1

      branch[:bad_child_count] += info[:bad_child_count_inc]
      branch[:child_count] += info[:child_count_inc]
    end

    comma_count = el.text.scan(/\,\s/).size

    branch[:score] += 2 if branch[:name] == "div"
    branch[:score] += 4 if comma_count > 10
    branch[:score_steps] << "lots of commas!" if comma_count > 5
    branch[:score] *= 3

    branch[:score] *= 0.7 if el.children && el.children.size < 3
    branch[:score] *= 1.25 if branch[:raw_word_count] > 10
    next if branch[:raw_word_count] < 10
    branch[:score] += [branch[:word_count], 1].max ** 0.5

    word_child_count_ratio = branch[:word_count].to_f / [branch[:child_count], 1].max
    branch[:word_child_count_ratio] = word_child_count_ratio

    if branch[:raw_word_count] > 100
      good_word_ratio = branch[:word_count].to_f / branch[:raw_word_count]
      branch[:score] += good_word_ratio * 12

      if word_child_count_ratio > 50
        branch[:score] *= 1.5
      elsif word_child_count_ratio > 30
        branch[:score] *= 1.2
      elsif word_child_count_ratio > 15
        branch[:score] *= 1.1
      elsif word_child_count_ratio < 4
        branch[:score] *= 0.9
      end
    end

    branch[:score_steps] << "s1: #{branch[:score]}"

    bad_child_ratio = branch[:bad_child_count].to_f / [branch[:child_count], 1].max
    branch[:bad_child_ratio] = bad_child_ratio
    # The ratio goes negative when good-word children outnumber bad-word ones.
    branch[:score] += 3 if bad_child_ratio < 0.0
    # The penalties below are cumulative: each threshold crossed subtracts more.
    branch[:score] -= 3 if bad_child_ratio > 0.15
    branch[:score] -= 2 if bad_child_ratio > 0.25
    branch[:score] -= 2 if bad_child_ratio > 0.4
    branch[:score] -= 4 if bad_child_ratio > 0.5
    branch[:score] -= 5 if bad_child_ratio > 0.7
    branch[:score] -= 5 if branch[:bad_child_count] > 20

    branch[:score] += depth
    branch[:score] *= 0.8 if ids.length > 10

    @tree[el.path] = branch
  end

  sorted_tree = @tree.sort_by { |_path, branch| branch[:score] }

  # Sort the branches by their score in reverse order and keep the top five
  @content_candidates = sorted_tree.reverse.first(5)
end
#content_at(index) ⇒ Object
148 149 150 |
# File 'lib/pismo/reader/tree.rb', line 148

# Look up the document node for the content candidate at the given position.
#
# Each entry of @content_candidates is a [path, branch] pair; the path is
# handed to @doc.at to fetch the node.
#
# index - position within @content_candidates (0 is the first entry).
#
# Returns whatever @doc.at returns for the candidate's path, or nil when
# there is no candidate at that index (including when @content_candidates
# has not been populated yet).
def content_at(index)
  candidate = (@content_candidates || [])[index]
  return unless candidate

  @doc.at(candidate.first)
end