Class: NewsCrawler::Processing::StructureAnalysis

Inherits:

Object

Object
NewsCrawler::Processing::StructureAnalysis

show all

Includes:: CrawlerModule, URLHelper

Defined in:: lib/news_crawler/processing/structure_analysis.rb

Overview

Analyse website structure to extract content. The database should only contain raw data from one website.

Instance Method Summary collapse

#analyse(url) ⇒ Object

Get and analyse url for information.
#classify_h2(root, limit) ⇒ Symbol

Predict whether the tree rooted at root is a fragment of an article or an index page.
#count_a_and_non_a_tag(node) ⇒ [Fixnum, Fixnum]

Count a tag and non-a tag in tree pointed by node.
#extract_content(url) ⇒ Object
#find_longest_node(doc) ⇒ Object

Find the longest text node that doesn’t have an a tag among its ancestors.
#find_lowest_ancestor_has_id(node) ⇒ Nokogiri::XML::Node

Find the lowest ancestor of the node that has an id attribute.
#get_result ⇒ Object
#hash_node(node, limit = -1) ⇒ String

Calculate the hash of a node from its own info and that of its children.
#initialize ⇒ StructureAnalysis constructor

A new instance of StructureAnalysis.
#is_url?(url) ⇒ Boolean

Check if it is really ‘url’.
#node_info(node) ⇒ String

Return a String representing the node’s name, id and class.
#remove_tag(html_doc, tag) ⇒ Object

Remove unwanted HTML tag.

Methods included from URLHelper

#get_url_path, #same_domain?

Methods included from CrawlerModule

#find_all, #find_one, #find_unprocessed, #load_yaml, #mark_all_as_unprocessed, #mark_processed, #mark_unprocessed, #next_unprocessed, #save_yaml

Constructor Details

#initialize ⇒ `StructureAnalysis`

Returns a new instance of StructureAnalysis.

# File 'lib/news_crawler/processing/structure_analysis.rb', line 40

# Create the analyser and immediately drain the queue of unprocessed URLs.
#
# Every URL handed out by next_unprocessed is logged, run through
# #extract_content, remembered in @url_stats (keyed by URL) and persisted
# with save_yaml. The loop ends when next_unprocessed returns nil/false.
def initialize
  @url_stats = {}
  while (pending_url = next_unprocessed)
    NCLogger.get_logger.info "[NC::P::SA] Processing #{pending_url}"
    extracted = extract_content(pending_url)
    @url_stats[pending_url] = extracted
    save_yaml(pending_url, extracted)
  end
end

Instance Method Details

#analyse(url) ⇒ `Object`

Get and analyse url for information

# File 'lib/news_crawler/processing/structure_analysis.rb', line 264

# Fetch the stored page for +url+, collect the href of every <a> element and
# count each collected link in @url_stats; finally mark +url+ as processed.
#
# @param url [String] URL whose raw HTML is looked up through RawData
# @return [Object] whatever mark_processed returns
def analyse(url)
  doc = Nokogiri.HTML(RawData.find_by_url(url))

  links = doc.xpath('//a').map do |anchor|
    href = anchor.attribute('href').to_s
    # hrefs beginning with '/' are resolved against the page URL
    href[0] == '/' ? URI.join(url, href).to_s : href
  end

  # Drop empty hrefs and the usual placeholder targets
  links.reject! do |link|
    link.nil? || link.size == 0 || link == '#' || link == 'javascript:;'
  end

  links.each do |link|
    @url_stats[link] = (@url_stats[link] || 0) + 1
  end
  mark_processed(url)
end

#classify_h2(root, limit) ⇒ `Symbol`

Predict whether the tree rooted at root is a fragment of an article or an index page

Parameters:

root (Nokogiri::XML::Node)

Returns:

(Symbol) —

one of :article, :list

# File 'lib/news_crawler/processing/structure_analysis.rb', line 103

# Predict whether the tree around +root+ belongs to an article or to an
# index (link list) page.
#
# Starting at +root+ (or its parent when +root+ is a text node) the method
# climbs one ancestor per iteration. At each level it counts the children of
# the ancestor whose hash_node signature (computed with depth - 1) equals the
# depth-0 signature of the node it just climbed from. The climb stops at the
# first ancestor holding more than one such child, or when the ancestor equals
# +limit+. That ancestor is classified with count_a_and_non_a_tag: more long
# non-link texts than <a> tags means :article, otherwise :list.
#
# If the climb runs out of parents before reaching +limit+, :list is returned
# instead of failing on a missing parent.
#
# @param root [Nokogiri::XML::Node] node to start from
# @param limit [Nokogiri::XML::Node] ancestor at which the climb stops
# @return [Symbol] one of :article, :list
def classify_h2(root, limit)
  current = root
  current = current.parent if current.text?

  depth = 0

  loop do
    expect_hash = hash_node(current, 0)

    # Guard against walking off the top of the tree (limit is not an ancestor
    # of root). respond_to? covers node types that do not expose #parent at
    # all -- presumably the document node; verify against Nokogiri.
    parent = current.respond_to?(:parent) ? current.parent : nil
    return :list if parent.nil?

    current = parent
    depth += 1

    similar = current.children.count do |child|
      hash_node(child, depth - 1) == expect_hash
    end
    next unless similar > 1 || current == limit

    a_tag_len, non_a_tag_len = count_a_and_non_a_tag(current)
    return non_a_tag_len > a_tag_len ? :article : :list
  end
end

#count_a_and_non_a_tag(node) ⇒ `[Fixnum, Fixnum]`

Count a tag and non-a tag in tree pointed by node

Parameters:

node (Nokogiri::XML::Node)

Returns:

([Fixnum, Fixnum]) —

a tag and non-a tag

# File 'lib/news_crawler/processing/structure_analysis.rb', line 153

# Count the <a> tags and the "long" non-link text nodes below +node+.
#
# A text node is counted when it has no <a> ancestor and its content, with
# all whitespace removed, is longer than 15 characters.
#
# @param node [Nokogiri::XML::Node] root of the subtree to inspect
# @return [Array(Integer, Integer)] number of a tags, number of long texts
def count_a_and_non_a_tag(node)
  anchor_total = node.xpath('.//a').count

  text_nodes = node.xpath('.//text()[not (ancestor::a)]').to_a
  long_text_total = text_nodes.count do |text_node|
    text_node.content.gsub(/\s+/, '').length > 15
  end

  [anchor_total, long_text_total]
end

#extract_content(url) ⇒ `Object`

# File 'lib/news_crawler/processing/structure_analysis.rb', line 50

# Extract the main content of the page stored for +url+.
#
# The raw HTML is loaded through RawData, stripped of script/iframe/style
# tags and parsed. The longest non-link text node is located together with
# its closest ancestor carrying an id; classify_h2 then decides whether the
# page is an article or a list. For articles the single <h1> below that
# ancestor becomes the title and all text found along the path to the longest
# node becomes the content.
#
# The URL is marked as processed on every return path, including the early
# :list results, so it is not left in the unprocessed queue.
#
# @param url [String] URL of a page whose raw data has been stored
# @return [Hash] always has :type (:article or :list); :title when exactly
#   one h1 was found; :content whenever classify_h2 answered :article
def extract_content(url)
  html_doc = RawData.find_by_url(url)

  # Remove tags causing trouble to nokogiri
  %w[script iframe style].each do |tag|
    html_doc = remove_tag(html_doc, tag)
  end

  doc = Nokogiri::HTML.parse(html_doc)
  longest = find_longest_node(doc)
  lowest_ancestor, path_to_longest = find_lowest_ancestor_has_id(longest) if longest

  # Either there is no usable text node at all, or (heuristic 1) the longest
  # text sits directly inside the element carrying the id attribute.
  if longest.nil? || path_to_longest.length == 2
    mark_processed(url)
    return { :type => :list }
  end

  # Skip the text node's own entry, then order the steps from the id'd
  # ancestor down to the parent of the longest text node.
  xpath_path = '//' + path_to_longest[1..-1].reverse.join('/') + '//text()'

  result = { :type => classify_h2(longest, lowest_ancestor) }

  if result[:type] == :article
    titles = lowest_ancestor.css('h1')
    if titles.count == 1
      result[:title] = titles.to_a[0].content
    else
      # if the title cannot be guessed then assume it isn't an article
      result[:type] = :list
    end

    result[:content] = lowest_ancestor.xpath(xpath_path).map(&:content).join
  end

  mark_processed(url)
  result
end

#find_longest_node(doc) ⇒ `Object`

Find the longest text node that doesn’t have an a tag among its ancestors

Parameters:

doc (Nokogiri::XML::Node)

# File 'lib/news_crawler/processing/structure_analysis.rb', line 198

# Find the longest text node that lives below an element with an id and has
# no <a> ancestor. Length is measured after removing all whitespace; on a tie
# the earliest node wins.
#
# @param doc [Nokogiri::XML::Node] document (or node) to search
# @return [Nokogiri::XML::Node, nil] the longest text node, or nil when no
#   candidate contains any non-whitespace character
def find_longest_node(doc)
  candidates = doc.xpath('//*[@id]//text()[not (ancestor::a)]')

  winner, = candidates.inject([nil, 0]) do |(best, best_len), candidate|
    # whitespace is ignored so indentation does not inflate the length
    stripped_len = candidate.content.gsub(/\s/, '').length
    stripped_len > best_len ? [candidate, stripped_len] : [best, best_len]
  end

  winner
end

#find_lowest_ancestor_has_id(node) ⇒ `Nokogiri::XML::Node`

Find the lowest ancestor of the node that has an id attribute

Parameters:

node (Nokogiri::XML::Node)

Returns:

(Nokogiri::XML::Node)

# File 'lib/news_crawler/processing/structure_analysis.rb', line 171

# Climb from +node+ to the closest node (itself included) that has an id
# attribute, recording one XPath step per visited node.
#
# Nodes without an id are recorded as "name[@class = '...']" or
# "name[not(@class)]"; the node with the id closes the list as
# "name[@id='...']". Steps are ordered from +node+ upwards.
#
# @param node [Nokogiri::XML::Node] node to start from
# @return [Array] two elements: the node carrying the id, and the Array of
#   XPath step Strings collected on the way
def find_lowest_ancestor_has_id(node)
  cursor = node
  steps = []

  until cursor.has_attribute?('id')
    class_test =
      if cursor.has_attribute?('class')
        "@class = '#{cursor.attribute('class')}'"
      else
        'not(@class)'
      end
    steps << "#{cursor.node_name}[#{class_test}]"
    cursor = cursor.parent
  end

  steps << "#{cursor.node_name}[@id='#{cursor.attribute('id')}']"
  [cursor, steps]
end

#get_result ⇒ `Object`



294
295
296

# File 'lib/news_crawler/processing/structure_analysis.rb', line 294

# Return the statistics collected so far.
#
# @return [Hash] the @url_stats instance variable, exactly as stored (no copy
#   is made, so callers share the same object)
def get_result
  @url_stats
end

#hash_node(node, limit = -1) ⇒ `String`

Calculate the hash of a node from its own info and that of its children

Parameters:

node (Nokogiri::XML::Node)
limit (Fixnum) (defaults to: -1) —

limit depth of children (-1 for unlimited)

Returns:

(String) —

Hash of node in base 64 encode

# File 'lib/news_crawler/processing/structure_analysis.rb', line 239

# Compute a structural hash of +node+.
#
# The signature starts as the node name, followed by "#id" and ".class" when
# those attributes are present. Unless +limit+ is 0, the distinct hashes of
# the children (each computed with limit - 1, in first-seen order) are
# appended. The signature is then digested with SHA2.
#
# @param node [Nokogiri::XML::Node] node to hash
# @param limit [Integer] how many levels of children to include; 0 means the
#   node alone, a negative value never reaches 0 and so is unlimited
# @return [String] Base64-encoded digest of the signature
def hash_node(node, limit = -1)
  id_attr = node['id']
  class_attr = node['class']

  signature = node.node_name
  signature += "##{id_attr}" unless id_attr.nil?
  signature += ".#{class_attr}" unless class_attr.nil?

  unless limit == 0
    # identical child hashes contribute only once
    child_digests = node.children.map { |child| hash_node(child, limit - 1) }
    signature += child_digests.uniq.join
  end

  Digest::SHA2.new.base64digest(signature)
end

#is_url?(url) ⇒ `Boolean`

Check if it is really ‘url’

Parameters:

url (String)

Returns:

(Boolean)



290
291
292

# File 'lib/news_crawler/processing/structure_analysis.rb', line 290

# Check whether +url+ is a real link target rather than a placeholder.
#
# nil, the empty string, '#' and 'javascript:;' are rejected; anything else
# is accepted. nil is handled explicitly so the check can be applied to
# missing href values without raising.
#
# @param url [String, nil] candidate URL
# @return [Boolean]
def is_url?(url)
  return false if url.nil?

  (url.size != 0) && (url != '#') && (url != 'javascript:;')
end

#node_info(node) ⇒ `String`

Return a String representing the node’s name, id and class

Parameters:

node (Nokogiri::XML::Node)

Returns:

(String)

# File 'lib/news_crawler/processing/structure_analysis.rb', line 228

# Build a short CSS-like label for +node+: its name, then "#id" and ".class"
# for whichever of those attributes the node has.
#
# @param node [Nokogiri::XML::Node] node to describe
# @return [String] e.g. "div#main.wide"
def node_info(node)
  [['#', 'id'], ['.', 'class']].inject(node.node_name) do |label, (prefix, attr_name)|
    if node.has_attribute?(attr_name)
      label + (prefix + node.attribute(attr_name))
    else
      label
    end
  end
end

#remove_tag(html_doc, tag) ⇒ `Object`

Remove unwanted HTML tag

Parameters:

html_doc (String) —

HTML document
tag (String) —

tag to be removed

# File 'lib/news_crawler/processing/structure_analysis.rb', line 220

# Remove every <tag ...>...</tag> element (opening tag, content and closing
# tag) from an HTML string.
#
# The tag name is matched as a whole word and case-insensitively, so removing
# 'a' does not touch <article> and removing 'script' also removes <SCRIPT>.
# The name is escaped before being placed in the pattern. Matching is
# regex-based and non-greedy: nested elements of the same tag are not
# balanced.
#
# @param html_doc [String] HTML document
# @param tag [String] name of the tag to be removed
# @return [String] a copy of html_doc without the matched elements
def remove_tag(html_doc, tag)
  name = Regexp.escape(tag)
  # MULTILINE lets '.' span newlines inside the element body
  pattern = Regexp.new("<#{name}\\b.*?>.*?</#{name}\\s*>",
                       Regexp::MULTILINE | Regexp::IGNORECASE)
  html_doc.gsub(pattern, '')
end

Class: NewsCrawler::Processing::StructureAnalysis

Overview

Instance Method Summary collapse

Methods included from URLHelper

Methods included from CrawlerModule

Constructor Details

#initialize ⇒ StructureAnalysis

Instance Method Details

#analyse(url) ⇒ Object

#classify_h2(root, limit) ⇒ Symbol

#count_a_and_non_a_tag(node) ⇒ [Fixnum, Fixnum]

#extract_content(url) ⇒ Object

#find_longest_node(doc) ⇒ Object

#find_lowest_ancestor_has_id(node) ⇒ Nokogiri::XML::Node

#get_result ⇒ Object

#hash_node(node, limit = -1) ⇒ String

#is_url?(url) ⇒ Boolean

#node_info(node) ⇒ String

#remove_tag(html_doc, tag) ⇒ Object

#initialize ⇒ `StructureAnalysis`

#analyse(url) ⇒ `Object`

#classify_h2(root, limit) ⇒ `Symbol`

#count_a_and_non_a_tag(node) ⇒ `[Fixnum, Fixnum]`

#extract_content(url) ⇒ `Object`

#find_longest_node(doc) ⇒ `Object`

#find_lowest_ancestor_has_id(node) ⇒ `Nokogiri::XML::Node`

#get_result ⇒ `Object`

#hash_node(node, limit = -1) ⇒ `String`

#is_url?(url) ⇒ `Boolean`

#node_info(node) ⇒ `String`

#remove_tag(html_doc, tag) ⇒ `Object`