Class: Charles::Document
Instance Method Summary
collapse
Methods included from Images
#calculate_image_from_node, #calculate_images, #filtered_images, #filtered_images_extra, #get_image, #image, #images
#clean_title, #title
Constructor Details
#initialize(input, options = {}) ⇒ Document
Returns a new instance of Document.
9
10
11
12
13
14
15
16
|
# File 'lib/charles/document.rb', line 9
# Parses the supplied markup and collects the candidate content nodes.
#
# input   - markup passed straight to Nokogiri::HTML.parse.
# options - stored untouched in @options (read later by #mechanize_agent).
def initialize(input, options={})
  @document = Nokogiri::HTML.parse(input)
  # Drop script and style elements before any text is measured.
  @document.search("script, style").remove
  body_descendants = @document.search('body *')
  # Keep only nodes whose clean_inner_tokens_text has a size above 30.
  @nodes = body_descendants.select do |candidate|
    candidate.clean_inner_tokens_text.size > 30
  end
  @options = options
end
|
Instance Method Details
#calculate_content_nodes(seeds = {}) ⇒ Object
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
|
# File 'lib/charles/document.rb', line 36
# Scores every candidate node in @nodes and returns them ranked best-first.
#
# Each node gets four factors which are multiplied into a single :score:
#   :length            - grows towards 1 as clean_inner_tokens_text.size grows
#   :internal_nodes    - shrinks as internal_nodes_size grows
#   :distance_from_top - shrinks the further down the node sits in @nodes
#   :title_match       - grows with the node's entry in content_node_ferret_index
#
# seeds - Hash overriding any of the default weights below (merged on top of
#         the defaults, so partial hashes are fine).
#
# Returns an Array of Hashes ({:node, :score, :scores}) sorted by descending
# :score. Empty when there are no candidate nodes.
def calculate_content_nodes(seeds={})
  # NOTE(review): these look like empirically tuned weights - confirm origin.
  default_seeds = {
    :title_match => 0.0586074856962615,
    :title_match_buffer => 0.508671373602233,
    :length => 1246.27917099503,
    :distance_from_top => 0.436005480844439,
    :internal_nodes => 18.0265463704097,
    :internal_nodes_buffer => 32.7588984705223
  }
  seeds = default_seeds.merge(seeds)
  title_buffer = seeds[:title_match_buffer]
  total = @nodes.size

  ranked = @nodes.each_with_index.map do |node, position|
    rank = position + 1
    # Nodes the title search did not hit have no entry; treat them as 0.0.
    title_score = content_node_ferret_index[position] || 0.0
    scores = {
      :length => 1 - seeds[:length].to_f / (node.clean_inner_tokens_text.size + seeds[:length]),
      :internal_nodes => seeds[:internal_nodes].to_f / (node.internal_nodes_size + seeds[:internal_nodes] + seeds[:internal_nodes_buffer]),
      :distance_from_top => (1 - (rank.to_f / total))**seeds[:distance_from_top].to_f,
      # Explicit parentheses matter here: `a || 0.0 + b` parses as
      # `a || (0.0 + b)` and `x / 1 + b` as `(x / 1) + b`, which let an
      # unmatched node outscore a weakly matched one. The buffered score is
      # normalised by (1 + buffer) so a perfect match yields exactly 1.
      :title_match => ((title_score + title_buffer) / (1 + title_buffer))**seeds[:title_match].to_f
    }
    {:node => node, :score => scores.values.inject(:*), :scores => scores}
  end

  ranked.sort { |a, b| b[:score] <=> a[:score] }
end
|
#caluclate_content_node_ferret_index ⇒ Object
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
# File 'lib/charles/document.rb', line 96
# Indexes the text of every candidate node with Ferret, searches that index
# for the document title and returns the normalised hit scores.
#
# Returns an Array aligned with @nodes: element i is the node's hit score
# divided by the best hit score of the search, or nil when node i was not
# hit. Returns an empty Array when there are no nodes or no usable title
# terms.
def caluclate_content_node_ferret_index
  # Nothing to index or search, so skip the Ferret round-trip entirely.
  return [] if @nodes.empty?

  # Strip characters that would otherwise be read as query syntax.
  query = title.to_s.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/,'')
  # NOTE(review): a nil/blank title is treated as "no node matches" - confirm
  # that is the desired fallback.
  return [] if query.strip.empty?

  index = Ferret::Index::Index.new()
  index.field_infos.add_field(:id, :store => :yes)
  index.field_infos.add_field(:content, :store => :no, :boost => 1)
  @nodes.each_with_index do |node, position|
    index << {
      :id => position,
      :content => node.clean_inner_text,
    }
  end

  results = index.search(query, :limit => @nodes.size)
  best_score = results.max_score
  normalised = []
  results.hits.each do |hit|
    # The stored :id is the node's position in @nodes.
    position = index[hit.doc][:id].to_i
    normalised[position] = hit.score / best_score
  end
  normalised
end
|
#content(seeds = {}) ⇒ Object
24
25
26
27
28
|
# File 'lib/charles/document.rb', line 24
# Returns the clean_inner_text of the refined best content node, or nil when
# #content_node finds nothing.
#
# seeds - weight overrides forwarded to #content_node.
def content(seeds={})
  best_node = content_node(seeds)
  if best_node
    refined = refine_content_node(best_node)
    refined.clean_inner_text
  end
end
|
#content_node(seeds = {}) ⇒ Object
30
31
32
33
34
|
# File 'lib/charles/document.rb', line 30
# Returns the :node of the top-ranked entry from #calculate_content_nodes,
# or nil when the ranking is empty.
#
# seeds - weight overrides forwarded to #calculate_content_nodes.
def content_node(seeds={})
  top_entry = calculate_content_nodes(seeds).first
  top_entry ? top_entry[:node] : nil
end
|
#content_node_ferret_index ⇒ Object
93
94
95
|
# File 'lib/charles/document.rb', line 93
# Memoised accessor: computes the title-match index on first use via
# #caluclate_content_node_ferret_index and reuses the cached value afterwards.
def content_node_ferret_index
  return @content_node_ferret_index if @content_node_ferret_index

  @content_node_ferret_index = caluclate_content_node_ferret_index
end
|
#interesting_content(options = {:max_length => 388}) ⇒ Object
20
21
22
|
# File 'lib/charles/document.rb', line 20
# Passes the extracted #content through Shiner.shine.
#
# options - Hash handed to Shiner.shine unchanged; defaults to
#           {:max_length => 388}.
#
# Returns whatever Shiner.shine returns.
def interesting_content(options = {:max_length => 388})
  extracted = content
  Shiner.shine(extracted, options)
end
|
#logger ⇒ Object
18
|
# File 'lib/charles/document.rb', line 18
# Delegates to the logger exposed by the Charles namespace.
def logger
  Charles.logger
end
|
#mechanize_agent ⇒ Object
126
127
128
|
# File 'lib/charles/document.rb', line 126
# Returns the Mechanize agent kept under @options[:mechanize_agent].
#
# When that slot is empty a new agent is created with the 'Mac Mozilla'
# user-agent alias and stored back into @options, so the same instance is
# returned on later calls.
def mechanize_agent
  existing = @options[:mechanize_agent]
  return existing if existing

  @options[:mechanize_agent] = Mechanize.new { |agent| agent.user_agent_alias = 'Mac Mozilla' }
end
|
#refine_content_node(node) ⇒ Object
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
# File 'lib/charles/document.rb', line 71
# Returns a trimmed copy of +node+; the node passed in is not modified here
# because all work happens on node.dup.
#
# Leading children are removed until the first child whose
# clean_inner_tokens_text has a size of at least 30, then the same is done
# from the trailing end. Finally a single space is inserted after every
# element matched by search('*').
def refine_content_node(node)
  trimmed = node.dup
  min_size = 30

  # Removes children in the given order, stopping at the first one that is
  # large enough to keep.
  drop_short_run = lambda do |children|
    children.each do |child|
      break unless child.clean_inner_tokens_text.size < min_size
      child.remove
    end
  end

  drop_short_run.call(trimmed.children)
  drop_short_run.call(trimmed.children.reverse)

  trimmed.search('*').each { |element| element.after(' ') }
  trimmed
end
|