Class: Charles::Document

Inherits:
Object
  • Object
show all
Includes:
Images, InternalAttributes
Defined in:
lib/charles/document.rb

Instance Method Summary collapse

Methods included from Images

#calculate_image_from_node, #calculate_images, #filtered_images, #filtered_images_extra, #get_image, #image, #images

Methods included from InternalAttributes

#clean_title, #title

Constructor Details

#initialize(input, options = {}) ⇒ Document

Returns a new instance of Document.



9
10
11
12
13
14
15
16
# File 'lib/charles/document.rb', line 9

# Builds a document from raw markup.
#
# The input is parsed with Nokogiri, every <script> and <style> element is
# removed, and the elements below <body> whose `clean_inner_tokens_text.size`
# exceeds 30 are kept in @nodes as the candidate nodes.
#
# @param input   markup handed to Nokogiri::HTML.parse
# @param options [Hash] stored unchanged in @options
def initialize(input, options={})
  @options = options
  @document = Nokogiri::HTML.parse(input)
  @document.search("script, style").remove
  # 30 is an arbitrary minimum inner-text size for a candidate node.
  @nodes = @document.search('body *').select do |candidate|
    candidate.clean_inner_tokens_text.size > 30
  end
end

Instance Method Details

#calculate_content_nodes(seeds = {}) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/charles/document.rb', line 36

# Scores every candidate node in @nodes and returns them best-first.
#
# Each node gets four factors which are multiplied into its final score:
#
# * :length            - grows with the size of the node's inner text
# * :internal_nodes    - shrinks as the node contains more internal nodes
# * :distance_from_top - shrinks the later the node appears in @nodes; the
#                        last node has a base of 0 for this factor
# * :title_match       - normalised search score of the node against the
#                        page title (see #content_node_ferret_index)
#
# @param seeds [Hash] overrides for the default weights below
# @return [Array<Hash>] one `{:node, :score, :scores}` hash per node,
#   sorted by :score in descending order (empty when there are no nodes)
def calculate_content_nodes(seeds={})
  default_seeds = {
    :title_match => 0.0586074856962615, # alternatives tried: 0.238237272128463, 0.173173520342878
    :title_match_buffer => 0.508671373602233,
    :length => 1246.27917099503,
    :distance_from_top => 0.436005480844439,
    :internal_nodes => 18.0265463704097,
    :internal_nodes_buffer => 32.7588984705223
  }
  seeds = default_seeds.merge(seeds)
  title_buffer = seeds[:title_match_buffer]

  ranked = @nodes.each_with_index.map do |node, position|
    rank = position + 1
    text_size = node.clean_inner_tokens_text.size

    # The index is looked up lazily (it is memoised) so that a document
    # without candidate nodes never builds it.
    title_score = content_node_ferret_index[position] || 0.0

    scores = {
      # length of inner text in this node, too little = less
      :length => 1 - seeds[:length].to_f / (text_size + seeds[:length]),
      # number of nodes in this node, too many = less
      :internal_nodes => seeds[:internal_nodes].to_f /
        (node.internal_nodes_size + seeds[:internal_nodes] + seeds[:internal_nodes_buffer]),
      # how far this element is from the top of the page
      :distance_from_top => (1 - (rank.to_f / @nodes.size))**seeds[:distance_from_top].to_f,
      # Search score against the page title, shifted by the buffer and
      # normalised back so that a perfect match yields a base of 1.
      # The previous expression `(s||0.0 + b) / 1 + b` parsed as
      # `(s || (0.0 + b)) / 1 + b`, which let a node with no match outscore
      # a node with a weak match.
      # NOTE(review): the default seeds may have been tuned against the old
      # expression - confirm whether they need re-tuning.
      :title_match => ((title_score + title_buffer) / (1 + title_buffer))**seeds[:title_match].to_f
      # Disabled factors kept for reference:
      #:interesting => (0.5 + node.interesting_score) ** seeds[:interesting].to_f
      #:special_characters => (1 - (node.inner_text.scan(/[^\s\302\240a-zA-Z]/).size.to_f / (node.clean_inner_text.size+1)))**2 # pretty cpu intensive!
    }

    {:node => node, :score => scores.values.inject(:*), :scores => scores}
  end

  ranked.sort! { |a, b| b[:score] <=> a[:score] }
end

#caluclate_content_node_ferret_index ⇒ Object



96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/charles/document.rb', line 96

# Indexes the text of every candidate node with Ferret and searches that
# index for the page title.
#
# @return [Array] sparse array aligned with @nodes: the entry at a node's
#   position is its hit score divided by the maximum score of the search;
#   positions of nodes that were not hit stay nil
def caluclate_content_node_ferret_index
  index = Ferret::Index::Index.new()
  index.field_infos.add_field(:id, :store => :yes)
  index.field_infos.add_field(:content, :store => :no, :boost => 1)

  # The stored :id is the node's position in @nodes, used below to map
  # hits back to nodes.
  @nodes.each_with_index do |node, position|
    index << {:id => position, :content => node.clean_inner_text}
  end

  # Remove special characters used by the ferret query parser:
  # http://www.davebalmain.com/api/classes/Ferret/QueryParser.html,
  # http://www.regular-expressions.info/charclass.html
  query = title.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/, '')
  results = index.search(query, :limit => @nodes.size)

  results.hits.each_with_object([]) do |hit, normalised|
    position = index[hit.doc][:id].to_i
    normalised[position] = hit.score / results.max_score
  end
end

#content(seeds = {}) ⇒ Object



24
25
26
27
28
# File 'lib/charles/document.rb', line 24

# Returns the cleaned text of the best-scoring content node.
#
# @param seeds [Hash] weights forwarded to #content_node
# @return the `clean_inner_text` of the refined content node, or nil when
#   #content_node finds nothing
def content(seeds={})
  best = content_node(seeds)
  best ? refine_content_node(best).clean_inner_text : nil
end

#content_node(seeds = {}) ⇒ Object



30
31
32
33
34
# File 'lib/charles/document.rb', line 30

# Picks the highest-scoring node from #calculate_content_nodes.
#
# @param seeds [Hash] weights forwarded to #calculate_content_nodes
# @return the :node of the first (best) entry, or nil when there is none
def content_node(seeds={})
  best = calculate_content_nodes(seeds).first
  best ? best[:node] : nil
end

#content_node_ferret_index ⇒ Object



93
94
95
# File 'lib/charles/document.rb', line 93

# Memoised accessor for the title-match scores: the index is computed by
# #caluclate_content_node_ferret_index on first use and cached in
# @content_node_ferret_index afterwards.
def content_node_ferret_index
  return @content_node_ferret_index if @content_node_ferret_index

  @content_node_ferret_index = caluclate_content_node_ferret_index
end

#interesting_content(options = {:max_length => 388}) ⇒ Object



20
21
22
# File 'lib/charles/document.rb', line 20

# Passes the document's #content through Shiner.shine.
#
# @param options [Hash] handed to Shiner.shine unchanged; defaults to a
#   :max_length of 388
# @return whatever Shiner.shine returns
def interesting_content(options = {:max_length => 388})
  text = content
  Shiner.shine(text, options)
end

#logger ⇒ Object



18
# File 'lib/charles/document.rb', line 18

# Shortcut to the logger exposed by Charles.logger.
def logger
  Charles.logger
end

#mechanize_agent ⇒ Object



126
127
128
# File 'lib/charles/document.rb', line 126

# Returns the Mechanize agent stored under @options[:mechanize_agent].
# When none is stored yet, a new agent with the 'Mac Mozilla' user agent
# alias is created and saved into @options for later calls.
def mechanize_agent
  existing = @options[:mechanize_agent]
  return existing if existing

  @options[:mechanize_agent] = Mechanize.new { |agent| agent.user_agent_alias = 'Mac Mozilla' }
end

#refine_content_node(node) ⇒ Object



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/charles/document.rb', line 71

# Returns a de-cluttered copy of +node+; the node passed in is duplicated
# first and the copy is what gets modified.
#
# Children at the start and at the end of the copy are removed for as long
# as their `clean_inner_tokens_text.size` stays below 30; trimming from
# either side stops at the first child that reaches that size. Afterwards
# `after(' ')` is called on every element matched by `search('*')`.
#
# @param node the content node to refine
# @return the trimmed copy
def refine_content_node(node)
  copy = node.dup
  min_size = 30

  # Removes children one by one until the first sufficiently long one.
  strip_short = lambda do |children|
    children.each do |child|
      break unless child.clean_inner_tokens_text.size < min_size
      child.remove
    end
  end

  strip_short.call(copy.children)          # leading clutter
  strip_short.call(copy.children.reverse)  # trailing clutter

  copy.search('*').each { |element| element.after(' ') }

  copy
end