Class: NewsCrawler::Processing::StructureAnalysis
- Inherits:
-
Object
- Object
- NewsCrawler::Processing::StructureAnalysis
- Includes:
- CrawlerModule, URLHelper
- Defined in:
- lib/news_crawler/processing/structure_analysis.rb
Overview
Analyse website structure to extract content. The database should only contain raw data from one website.
Instance Method Summary collapse
-
#analyse(url) ⇒ Object
Get and analyse url for information.
-
#classify_h2(root, limit) ⇒ Symbol
Predict whether the tree pointed to by root is a fragment of an article or an index page.
-
#count_a_and_non_a_tag(node) ⇒ [Fixnum, Fixnum]
Count a tag and non-a tag in tree pointed by node.
- #extract_content(url) ⇒ Object
-
#find_longest_node(doc) ⇒ Object
Find the longest text node that doesn’t have an a tag in its ancestor list.
-
#find_lowest_ancestor_has_id(node) ⇒ Nokogiri::XML::Node
Find the lowest ancestor of the node that has an id attribute.
- #get_result ⇒ Object
-
#hash_node(node, limit = -1) ⇒ String
Calculate the hash of a node from its own info and its children’s info.
-
#initialize ⇒ StructureAnalysis
constructor
A new instance of StructureAnalysis.
-
#is_url?(url) ⇒ Boolean
Check if it is really ‘url’.
-
#node_info(node) ⇒ String
Return String represents node’s name, node’s id and node’s class.
-
#remove_tag(html_doc, tag) ⇒ Object
Remove unwanted HTML tag.
Methods included from URLHelper
Methods included from CrawlerModule
#find_all, #find_one, #find_unprocessed, #load_yaml, #mark_all_as_unprocessed, #mark_processed, #mark_unprocessed, #next_unprocessed, #save_yaml
Constructor Details
#initialize ⇒ StructureAnalysis
Returns a new instance of StructureAnalysis.
40 41 42 43 44 45 46 47 48 |
# File 'lib/news_crawler/processing/structure_analysis.rb', line 40

# Build the analyser and immediately drain the queue of unprocessed
# URLs: each one is analysed via #extract_content, cached in
# @url_stats, and persisted with #save_yaml.
def initialize
  @url_stats = {}
  while (url = next_unprocessed)
    NCLogger.get_logger.info "[NC::P::SA] Processing #{url}"
    analysis = extract_content(url)
    @url_stats[url] = analysis
    save_yaml(url, analysis)
  end
end
Instance Method Details
#analyse(url) ⇒ Object
Get and analyse url for information
264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 |
# File 'lib/news_crawler/processing/structure_analysis.rb', line 264

# Get and analyse +url+ for information: collect every outgoing link on
# the page, resolve root-relative links against +url+, and bump a
# per-link counter in @url_stats. Marks +url+ processed when done.
#
# @param url [String] URL of a page already stored in RawData
def analyse(url)
  # puts "processing #{url}"
  html_doc = RawData.find_by_url(url)
  doc = Nokogiri.HTML(html_doc)

  inner_url = doc.xpath('//a').map do |a_el|
    temp_url = (a_el.attribute 'href').to_s
    # Resolve root-relative hrefs against the page's own URL.
    if (!temp_url.nil?) && (temp_url[0] == '/')
      temp_url = URI.join(url, temp_url).to_s
    end
    temp_url
  end

  # Drop empty, fragment-only and javascript pseudo-links.
  inner_url.delete_if do |link|
    link.nil? || link.size == 0 || link == '#' || link == 'javascript:;'
  end

  # Block param renamed from `url` -- it shadowed the method parameter
  # and triggered a Ruby shadowing warning.
  inner_url.each do |link|
    @url_stats[link] = (@url_stats[link] || 0) + 1
  end

  mark_processed(url)
end
#classify_h2(root, limit) ⇒ Symbol
Predict whether the tree pointed to by root is a fragment of an article or an index page
103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
# File 'lib/news_crawler/processing/structure_analysis.rb', line 103

# Predict whether the tree pointed to by +root+ is a fragment of an
# article or of an index page. Walks upwards from +root+; as soon as
# the current parent has more than one child with the same structural
# hash (a repeated layout block, typical of index pages) or the walk
# reaches +limit+, the subtree is classified by comparing link text
# against non-link text.
#
# @param root  [Nokogiri::XML::Node] starting node
# @param limit [Nokogiri::XML::Node] highest ancestor to inspect
# @return [Symbol] :article or :list
def classify_h2(root, limit)
  current = root
  current = current.parent if current.text?
  depth = 0

  # Cleaned up from the original: removed unused locals `lons` and
  # `node_list`, and `break` statements that were unreachable after
  # `return`; the two identical if/else classification branches are
  # merged into one.
  loop do
    expect_hash = hash_node(current, 0)
    current = current.parent
    depth += 1

    # Count siblings at this level that are structurally identical to
    # the subtree we came up from.
    node_count = current.children.count do |child|
      hash_node(child, depth - 1) == expect_hash
    end

    if node_count > 1 || current == limit
      a_tag_len, non_a_tag_len = count_a_and_non_a_tag(current)
      # More substantial prose than links means an article fragment.
      return non_a_tag_len > a_tag_len ? :article : :list
    end
  end
end
#count_a_and_non_a_tag(node) ⇒ [Fixnum, Fixnum]
Count a tag and non-a tag in tree pointed by node
153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
# File 'lib/news_crawler/processing/structure_analysis.rb', line 153

# Count a tags and "substantial" non-link text nodes in the tree
# pointed to by +node+. A text node only counts when it has more than
# 15 non-whitespace characters, which filters out layout fragments.
#
# @param node [Nokogiri::XML::Node] subtree root
# @return [Array(Fixnum, Fixnum)] [a_tag_count, long_text_node_count]
def count_a_and_non_a_tag(node)
  a_tag_list = node.xpath './/a'
  a_tag_len = a_tag_list.count # number of a tag

  # Text nodes not wrapped in a link. Block param renamed from `node`
  # -- it shadowed the method parameter (Ruby shadowing warning).
  non_a_tag_list = node.xpath './/text()[not (ancestor::a)]'
  non_a_tag_len = non_a_tag_list.to_a.count do |text_node|
    text_node.content.gsub(/\s+/, '').length > 15
  end

  [a_tag_len, non_a_tag_len]
end
#extract_content(url) ⇒ Object
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/news_crawler/processing/structure_analysis.rb', line 50

# Fetch the raw HTML stored for +url+, guess whether the page is an
# article or an index/list page, and when it looks like an article pull
# out its title (single h1 under the anchor element) and main text.
#
# @param url [String] URL whose cached raw data will be analysed
# @return [Hash] :type (:article or :list), plus :title and :content
#   when an article was detected
def extract_content(url)
  html_doc = RawData.find_by_url(url)
  result = {}
  # BUG FIX: the original used `==` here (a no-op comparison), so the
  # default type was never actually assigned.
  result[:type] = :article

  # Remove tags causing trouble to nokogiri
  html_doc = remove_tag(html_doc, 'script')
  html_doc = remove_tag(html_doc, 'iframe')
  html_doc = remove_tag(html_doc, 'style')
  doc = Nokogiri::HTML.parse(html_doc)

  longest = find_longest_node(doc)
  lowest_ancestor, path_to_longest = find_lowest_ancestor_has_id(longest)

  # Heuristic 1: longest content hangs directly off an element with an
  # id attribute -- treat the page as an index/list.
  if path_to_longest.length == 2
    return { :type => :list }
  end

  # Build an XPath from the ancestor-with-id down to its text nodes
  # (path steps were recorded bottom-up, hence the reverse).
  parent = path_to_longest[1..-1].reverse
  xpath_path = '//' + parent.join('/') + '//text()'

  guessed_type = classify_h2(longest, lowest_ancestor)
  result = { :type => guessed_type }

  if result[:type] == :article
    title_ = lowest_ancestor.css('h1')
    if title_.count == 1
      result[:title] = title_.to_a[0].content
    else
      # if we can't guess a title then assume it isn't an article
      result[:type] = :list
    end

    main_content = ''
    lowest_ancestor.xpath(xpath_path).each do |node|
      main_content += node.content
    end
    result[:content] = main_content
  end

  mark_processed(url)
  result
end
#find_longest_node(doc) ⇒ Object
Find longest text node that doesn’t have a in ancestors list
198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
# File 'lib/news_crawler/processing/structure_analysis.rb', line 198

# Find the longest text node (measured with all whitespace stripped)
# that sits under an element carrying an id attribute and has no a tag
# among its ancestors.
#
# @param doc [Nokogiri::XML::Document] parsed document
# @return [Nokogiri::XML::Node, nil] longest matching text node, or nil
#   when no candidate has any non-whitespace content
def find_longest_node(doc)
  candidates = doc.xpath '//*[@id]//text()[not (ancestor::a)]'

  best = nil
  best_len = 0
  candidates.each do |text_node|
    # Whitespace is ignored when measuring length ("trick here" in the
    # original) so formatting runs don't win.
    stripped_len = text_node.content.gsub(/\s/, '').length
    if stripped_len > best_len
      best_len = stripped_len
      best = text_node
    end
  end
  best
end
#find_lowest_ancestor_has_id(node) ⇒ Nokogiri::XML::Node
Find the lowest node’s ancestor has id attribute
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
# File 'lib/news_crawler/processing/structure_analysis.rb', line 171

# Find the lowest ancestor of +node+ (possibly +node+ itself) that has
# an id attribute, recording an XPath-style step for every node visited
# on the way up.
#
# @param node [Nokogiri::XML::Node] starting node
# @return [Array(Nokogiri::XML::Node, Array<String>)] the ancestor with
#   an id, and the bottom-up list of path steps ending with the id step
def find_lowest_ancestor_has_id(node)
  ancestor = node
  path_steps = []

  # Climb until a node with an id is found; nodes without an id are
  # qualified by their class attribute (or its absence).
  until ancestor.has_attribute?('id')
    class_test =
      if ancestor.has_attribute?('class')
        "@class = '#{ancestor.attribute('class')}'"
      else
        'not(@class)'
      end
    path_steps << "#{ancestor.node_name}[#{class_test}]"
    ancestor = ancestor.parent
  end

  # Final step pins the anchor element by its id.
  path_steps << "#{ancestor.node_name}[@id='#{ancestor.attribute('id')}']"
  [ancestor, path_steps]
end
#get_result ⇒ Object
294 295 296 |
# File 'lib/news_crawler/processing/structure_analysis.rb', line 294

# Expose the per-URL statistics collected so far.
#
# @return [Hash] mapping of URL to its recorded analysis data
def get_result
  @url_stats
end
#hash_node(node, limit = -1)) ⇒ String
Calculate the hash of a node from its own info and its children’s info
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 |
# File 'lib/news_crawler/processing/structure_analysis.rb', line 239

# Calculate a structural hash of +node+ from its own name, id and class
# and, recursively, from its children's hashes.
#
# @param node  [Nokogiri::XML::Node] node to fingerprint
# @param limit [Fixnum] recursion depth; children are included while
#   limit != 0, so the default of -1 means "no depth limit"
# @return [String] Base64-encoded SHA-2 digest of the node signature
def hash_node(node, limit = -1)
  node_sign = node.node_name
  node_sign += "##{node['id']}" unless node['id'].nil?
  node_sign += ".#{node['class']}" unless node['class'].nil?

  hash_sum = node_sign
  if limit != 0
    # Collect child hashes through a Set so duplicated children only
    # contribute once, then fold them into the signature.
    # (Removed an empty `else` branch left over in the original.)
    child_hash = Set.new
    node.children.each do |child_node|
      child_hash.add(hash_node(child_node, limit - 1))
    end
    child_hash.each do |ch|
      hash_sum += ch
    end
  end
  Digest::SHA2.new.base64digest(hash_sum)
end
#is_url?(url) ⇒ Boolean
Check if it is really ‘url’
290 291 292 |
# File 'lib/news_crawler/processing/structure_analysis.rb', line 290

# Check if it is really 'url': rejects empty strings, fragment-only
# hrefs ('#') and javascript pseudo-links.
#
# @param url [String] candidate link text
# @return [Boolean]
def is_url?(url)
  !url.empty? && url != '#' && url != 'javascript:;'
end
#node_info(node) ⇒ String
Return String represents node’s name, node’s id and node’s class
228 229 230 231 232 233 |
# File 'lib/news_crawler/processing/structure_analysis.rb', line 228

# Return a String representing the node's name plus its id ("#id") and
# class (".class") when present, e.g. "div#main.article".
#
# @param node [Nokogiri::XML::Node]
# @return [String]
def node_info(node)
  node_pp = node.node_name
  # BUG FIX: interpolate instead of String#+. Nokogiri's attribute()
  # returns an Attr object, so the original '#' + node.attribute('id')
  # raised TypeError; interpolation calls #to_s.
  node_pp += "##{node.attribute('id')}" if node.has_attribute?('id')
  node_pp += ".#{node.attribute('class')}" if node.has_attribute?('class')
  node_pp
end
#remove_tag(html_doc, tag) ⇒ Object
Remove unwanted HTML tag
220 221 222 223 |
# File 'lib/news_crawler/processing/structure_analysis.rb', line 220

# Remove unwanted HTML tag: strips every occurrence of +tag+ (opening
# tag, body and closing tag) from the raw +html_doc+ string.
#
# @param html_doc [String] raw HTML
# @param tag [String] tag name, e.g. 'script'
# @return [String] HTML with the tag's elements removed
def remove_tag(html_doc, tag)
  # /m lets `.` cross newlines, so multi-line elements are matched too
  # (same as Regexp::MULTILINE in the original Regexp.new form).
  html_doc.gsub(/<#{tag}.*?>.*?<\/#{tag}>/m, '')
end