6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
|
# File 'lib/scrappy/server/admin.rb', line 6
def self.registered app
app.set :method_override, true
app.use Rack::Flash
# GET / — when both the `format` and `uri` params are present, redirect to
# "<base_uri>/<format>/<simplified uri>"; otherwise render the home view.
app.get '/' do
  fmt = params[:format]
  uri = params[:uri]
  if fmt && uri
    redirect "#{settings.base_uri}/#{fmt}/#{simplify_uri(uri)}"
  else
    haml :home
  end
end
# GET /javascript — serves annotator.js from the public folder as
# application/javascript, prefixed with a `window.scrappy_extractor=` assignment.
# NOTE(review): the first argument of agent.fragments_for is truncated in this
# listing ("Scrappy::Kb." has no member name), so the line is not valid Ruby as
# shown — restore the identifier from the original file.
app.get '/javascript' do
fragments = agent.fragments_for(Scrappy::Kb., params[:uri])
content_type 'application/javascript'
# The flag is the value of fragments.any? for the requested params[:uri].
"window.scrappy_extractor=#{fragments.any?};" + open("#{settings.public_folder}/javascripts/annotator.js").read
end
# GET /help — renders the help view.
app.get('/help') { haml :help }
# GET /extractors — collects the rdf:value of every node typed sc:UriSelector
# or sc:UriPatternSelector in the agent's kb, sorted and converted to strings,
# into @uris and renders the extractors view.
app.get '/extractors' do
  uri_selectors     = Agent::Options.kb.find(nil, Node('rdf:type'), Node('sc:UriSelector'))
  pattern_selectors = Agent::Options.kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector'))
  values = (uri_selectors + pattern_selectors).map { |node| node.rdf::value }
  @uris = values.flatten.sort.map { |value| value.to_s }
  haml :extractors
end
# POST /extractors — stores an extractor, then sets the "Extractor stored"
# flash notice and redirects to the extractors page.
#
# With params[:html]: the HTML is converted to UTF-8 from params[:encoding],
# wrapped together with params[:uri] as a single sample, and passed to
# agent.train_xpath. Without it: params[:rdf] is parsed as :yarf.
#
# NOTE(review): several identifiers are missing from this listing (the
# assignment target of train_xpath, the Scrappy::App method names, the agent
# method name and the Scrappy::Kb member), so the body is not valid Ruby as
# shown — restore them from the original file before editing.
app.post '/extractors' do
if params[:html]
html = Iconv.iconv('UTF-8', params[:encoding], params[:html]).first
samples = [{ :html=>html, :uri=>params[:uri] }]
= agent.train_xpath(*samples)
Scrappy::App.
Scrappy::App. agent.(Scrappy::Kb., samples), samples
else
Scrappy::App. RDF::Parser.parse(:yarf,params[:rdf])
end
flash[:notice] = "Extractor stored"
redirect "#{settings.base_uri}/extractors"
end
# DELETE /extractors/* — passes the splat (the extractor uri) to a
# Scrappy::App call, sets the "Extractor deleted" flash notice and redirects
# to the extractors page.
# NOTE(review): the Scrappy::App method name is missing in this listing —
# restore it from the original file.
app.delete '/extractors/*' do |uri|
Scrappy::App. uri
flash[:notice] = "Extractor deleted"
redirect "#{settings.base_uri}/extractors"
end
# GET /patterns — sets @patterns to the sc:Fragment-typed nodes of the patterns
# kb minus the result of the sc:subfragment query, then renders the patterns view.
app.get '/patterns' do
  fragments = Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment'))
  excluded  = Scrappy::Kb.patterns.find([], Node('sc:subfragment'), nil)
  @patterns = fragments - excluded
  haml :patterns
end
# GET /patterns/visual — same @patterns selection as the patterns listing, but
# each pattern is drawn with render_fragment and the concatenated markup is
# returned inside a bare <html><body> wrapper.
app.get '/patterns/visual' do
  fragments = Scrappy::Kb.patterns.find(nil, Node('rdf:type'), Node('sc:Fragment'))
  excluded  = Scrappy::Kb.patterns.find([], Node('sc:subfragment'), nil)
  @patterns = fragments - excluded
  drawn = @patterns.map { |pattern| render_fragment(pattern) }
  "<html><body>#{drawn.join('')}</body></html>"
end
# GET /patterns/* — looks the splat up in the patterns kb and returns that
# single fragment drawn with render_fragment inside an <html><body> wrapper.
app.get '/patterns/*' do |id|
  fragment = Scrappy::Kb.patterns[id]
  "<html><body>#{render_fragment(fragment)}</body></html>"
end
# DELETE /patterns — calls Scrappy::App.delete_patterns, then flashes a notice
# and redirects back to the patterns page.
app.delete('/patterns') do
  Scrappy::App.delete_patterns
  flash[:notice] = "Patterns deleted"
  redirect("#{settings.base_uri}/patterns")
end
# DELETE /patterns/* — passes the splat to Scrappy::App.delete_pattern, then
# flashes a notice and redirects back to the patterns page.
app.delete('/patterns/*') do |id|
  Scrappy::App.delete_pattern(id)
  flash[:notice] = "Pattern deleted"
  redirect("#{settings.base_uri}/patterns")
end
# GET /samples — exposes Scrappy::App.samples to the view as @samples and
# renders the samples view.
app.get('/samples') do
  @samples = Scrappy::App.samples
  haml(:samples)
end
# GET /samples/:id — draws a stored sample as a page of absolutely positioned boxes.
#
# Every element found by search("*") in the sample's HTML becomes one <div>,
# placed with the element's vx/vy/vw/vh attributes and styled from its
# vfont/vsize/vweight attributes. <a> elements are drawn blue and underlined;
# <img> elements get a translucent red background.
#
# The text and the font family are read from the stored sample HTML, so they
# are HTML-escaped before being interpolated into the response. Without that,
# a sample containing "<", "&" or "'" would break the generated markup or
# inject its own. The numeric attributes go through to_i and need no escaping.
app.get '/samples/:id' do |id|
  document = Nokogiri::HTML(Scrappy::App.samples[id.to_i][:html], nil, 'utf-8')
  document.search("*").map do |node|
    next if node.text?
    # Only direct text children are shown; non-text children yield nil, which
    # joins as an empty slot (same spacing as before).
    text = Rack::Utils.escape_html(node.children.map { |n| n.content if n.text? } * " ")
    x = node[:vx].to_i
    y = node[:vy].to_i
    w = node[:vw].to_i
    h = node[:vh].to_i
    # to_s keeps a missing attribute rendering as an empty value.
    font = Rack::Utils.escape_html(node[:vfont].to_s)
    size = node[:vsize].to_i
    weight = node[:vweight].to_i
    color = "#555"
    color = "#55f" if node.name == "a"
    style = "position: absolute; left: #{x}px; top: #{y}px; width: #{w}px; height: #{h}px; font-family: #{font}; font-size: #{size}px; font-weight: #{weight}; border: 1px solid gray; color: #{color};"
    style += "background-color: #f00; opacity: 0.2;" if node.name == "img"
    style += "text-decoration: underline;" if node.name == "a"
    "<div style='#{style}'>#{text}</div>"
  end * ""
end
# GET /samples/:id/raw — returns the stored :html of the sample at index id.
app.get '/samples/:id/raw' do |id|
  sample = Scrappy::App.samples[id.to_i]
  sample[:html]
end
# GET /samples/:id/annotations — serializes the sample's stored :output triples
# (or an empty list when there are none) as :yarf.
# NOTE(review): the 'Content-Type' line is a dangling hash pair in this listing;
# the name of the call that receives it appears to be missing — restore it from
# the original file.
app.get '/samples/:id/annotations' do |id|
'Content-Type' => 'text/plain'
RDF::Graph.new(Scrappy::App.samples[id.to_i][:output] || []).serialize(:yarf)
end
# GET /samples/:id/:kb_type — runs the agent on the stored sample (its :uri and
# :html) against the kb selected by kb_type and returns the result serialized
# as :yarf. kb_type "patterns" selects Scrappy::Kb.patterns.
# NOTE(review): the alternative Scrappy::Kb member, the name of the call taking
# the 'Content-Type' pair, and the agent method name are all missing in this
# listing — restore them from the original file.
app.get '/samples/:id/:kb_type' do |id,kb_type|
kb = (kb_type == "patterns" ? Scrappy::Kb.patterns : Scrappy::Kb.)
sample = Scrappy::App.samples[id.to_i]
'Content-Type' => 'text/plain'
RDF::Graph.new(agent.(sample[:uri], sample[:html], kb, Agent::Options.referenceable)).serialize(:yarf)
end
# POST /samples/annotate — for every index in params['samples'] (none when the
# param is absent), stores the agent's result for that sample in its :output,
# then saves the samples, flashes a notice and redirects to the samples page.
# NOTE(review): the agent method name and the Scrappy::Kb member are missing in
# this listing — restore them from the original file.
app.post '/samples/annotate' do
samples = (params['samples'] || []).map { |i| Scrappy::App.samples[i.to_i] }.each do |sample|
sample[:output] = agent.(sample[:uri], sample[:html], Scrappy::Kb.)
end
Scrappy::App.save_samples
flash[:notice] = "Samples annotated"
redirect "#{settings.base_uri}/samples"
end
# POST /samples/train/:kb_type — trains on the samples whose indexes are listed
# in params['samples']. For kb_type "patterns" the result of agent.train is
# passed to Scrappy::App.add_patterns; otherwise agent.train_xpath is used.
# Afterwards flashes a notice and redirects to the samples page.
# NOTE(review): the alternative Scrappy::Kb member and the Scrappy::App method
# in the else branch are missing in this listing. `kb` is assigned but not
# referenced by any visible line — confirm against the original file.
app.post '/samples/train/:kb_type' do |kb_type|
kb = (kb_type == "patterns" ? Scrappy::Kb.patterns : Scrappy::Kb.)
samples = (params['samples'] || []).map { |i| Scrappy::App.samples[i.to_i] }
if kb_type == "patterns"
Scrappy::App.add_patterns agent.train(*samples)
else
Scrappy::App. agent.train_xpath(*samples)
end
flash[:notice] = "Training completed"
redirect "#{settings.base_uri}/samples"
end
# POST /samples/optimize/:kb_type — optimizes the selected kb against the
# samples whose indexes are listed in params['samples']. For kb_type "patterns"
# the result of agent.optimize_patterns(kb, samples) is passed to
# Scrappy::App.save_patterns. Afterwards flashes a notice and redirects to the
# samples page.
# NOTE(review): the alternative Scrappy::Kb member and, in the else branch, the
# Scrappy::App and agent method names are missing in this listing — restore
# them from the original file.
app.post '/samples/optimize/:kb_type' do |kb_type|
kb = (kb_type == "patterns" ? Scrappy::Kb.patterns : Scrappy::Kb.)
samples = (params['samples'] || []).map { |i| Scrappy::App.samples[i.to_i] }
if kb_type == "patterns"
Scrappy::App.save_patterns agent.optimize_patterns(kb, samples)
else
Scrappy::App. agent.(kb, samples), samples
end
flash[:notice] = "Optimization completed"
redirect "#{settings.base_uri}/samples"
end
# POST /samples/test/:kb_type — scores the agent's output for the selected
# samples against the expected triples and renders the test view.
#
# Sets for the view:
#   @results   - hash keyed by predicate, by rdf:type object, and by :total;
#                each value holds :count, :triples, :fscore, :precision, :recall
#   @missing   - RDF::Graph built from `output - <second operand>`
#   @wrong     - RDF::Graph built from `<first operand> - output`
#   @total     - number of expected triples
#   @extracted - size of a collection whose name is missing here
#   @correct   - @extracted minus the number of wrong triples
#
# NOTE(review): many identifiers are missing in this listing — every line that
# starts with "=", "+=" or ".", ends in "," or "-", or contains "agent.(" or
# "Scrappy::Kb.)" has lost a name. They presumably all refer to the collection
# of triples produced by the agent, but that cannot be confirmed from here;
# restore them from the original file before changing any logic.
app.post '/samples/test/:kb_type' do |kb_type|
kb = (kb_type == "patterns" ? Scrappy::Kb.patterns : Scrappy::Kb.)
@results = {}
@missing = []
@wrong = []
# Expected triples: the optional "output" param parsed as N-Triples, extended
# below with each selected sample's stored :output.
output = RDF::Parser.parse(:ntriples, params["output"].to_s).triples
= []
(params['samples'] || []).each do |i|
sample = Scrappy::App.samples[i.to_i]
output += sample[:output] || []
+= agent.(sample[:uri], sample[:html], kb)
end
output = output.uniq
= .uniq
predicates = output.map { |s,p,o| p }.uniq
types = output.map { |s,p,o| o if p == ID('rdf:type') }.compact.uniq
# Per-predicate metrics, restricted to the triples carrying that predicate.
predicates.each do |predicate|
new_output = output.select { |s,p,o| p==predicate }
= .select { |s,p,o| p==predicate }
precision, recall, fscore = agent.send :metrics, new_output,
# Hash.new(0.0) makes the first += on each key start from 0.0.
@results[predicate] ||= Hash.new(0.0)
@results[predicate][:count] += 1
@results[predicate][:triples] = new_output.size
@results[predicate][:fscore] += fscore
@results[predicate][:precision] += precision
@results[predicate][:recall] += recall
end
# Per-type metrics, restricted to rdf:type triples whose object is that type.
types.each do |type|
new_output = output.select { |s,p,o| p==ID("rdf:type") and o==type }
= .select { |s,p,o| p==ID("rdf:type") and o==type }
precision, recall, fscore = agent.send :metrics, new_output,
@results[type] ||= Hash.new(0.0)
@results[type][:count] += 1
@results[type][:triples] = new_output.size
@results[type][:fscore] += fscore
@results[type][:precision] += precision
@results[type][:recall] += recall
end
# Overall metrics across all expected triples.
precision, recall, fscore = agent.send :metrics, output,
@results[:total] ||= Hash.new(0.0)
@results[:total][:count] += 1
@results[:total][:triples] = output.size
@results[:total][:fscore] += fscore
@results[:total][:precision] += precision
@results[:total][:recall] += recall
@missing += output -
@wrong += - output
# Average every stored figure by its :count. The loop divides every key,
# including :count and :triples themselves.
@results.each do |key, result|
count = result[:count]
result.each do |k,v|
result[k] /= count
end
end
@total = output.size
@extracted = .size
@correct = @extracted - @wrong.size
@missing = RDF::Graph.new(@missing)
@wrong = RDF::Graph.new(@wrong)
flash.now[:notice] = "Testing completed"
haml :test
end
# POST /samples — converts params[:html] to UTF-8 from params[:encoding], stores
# it with params[:uri] and the current time as a new sample, then flashes a
# notice and redirects to the samples page.
app.post '/samples' do
  utf8_html = Iconv.iconv('UTF-8', params[:encoding], params[:html]).first
  Scrappy::App.add_sample(:html=>utf8_html, :uri=>params[:uri], :date=>Time.now)
  flash[:notice] = "Sample stored"
  redirect("#{settings.base_uri}/samples")
end
# DELETE /samples/:id — passes the integer id to Scrappy::App.delete_sample,
# then flashes a notice and redirects to the samples page.
app.delete('/samples/:id') do |id|
  Scrappy::App.delete_sample(id.to_i)
  flash[:notice] = "Sample deleted"
  redirect("#{settings.base_uri}/samples")
end
# Draws a fragment as absolutely positioned <div> boxes, one per selector and
# per branch, each containing the fragment's label and its rendered subfragments.
#
# fragment        - object answering sc::relation, sc::type, sc::subfragment
#                   and sc::selector
# selected_branch - :min or :max to draw only that bound of each selector;
#                   nil (the default) draws both, :min first
#
# Returns the concatenated HTML as a String.
def render_fragment fragment, selected_branch=nil
  branches = [selected_branch || [:min, :max]].flatten

  # Label with the compressed relation ids when there are any, else the type ids.
  ids = fragment.sc::relation.first ? fragment.sc::relation : fragment.sc::type
  label = ids.map { |id| RDF::ID.compress(id) }.join(', ')

  # Children are rendered once per branch and embedded in every box below.
  nested_html = branches.map { |branch|
    fragment.sc::subfragment.map { |child| render_fragment(child, branch) }.join('')
  }.join('')

  branches.map { |branch|
    fragment.sc::selector.map { |selector|
      # :min boxes are blue, :max boxes are red; any other branch leaves every
      # figure nil, which interpolates as an empty value.
      bounds =
        if :min == branch
          [selector.sc::min_relative_x.first, selector.sc::min_relative_y.first,
           selector.sc::min_width.first, selector.sc::min_height.first,
           selector.sc::font_family.first, selector.sc::min_font_size.first,
           selector.sc::min_font_weight.first, :blue]
        elsif :max == branch
          [selector.sc::max_relative_x.first, selector.sc::max_relative_y.first,
           selector.sc::max_width.first, selector.sc::max_height.first,
           selector.sc::font_family.first, selector.sc::max_font_size.first,
           selector.sc::max_font_weight.first, :red]
        end
      x, y, w, h, font, size, weight, color = bounds
      style = "position: absolute; left: #{x}px; top: #{y}px; width: #{w}px; height: #{h}px; font-family: #{font}; font-size: #{size}px; font-weight: #{weight}; border: 1px solid #{color}; color: #555;"
      "<div style='#{style}'>#{label}#{nested_html}</div>"
    }.join('')
  }.join('')
end
# Formats a ratio as a percentage string with two decimals, e.g. 0.5 => "50.00%".
#
# @param value [Numeric] ratio to format; it is multiplied by 100.0
# @return [String] the scaled value followed by a percent sign
def percentage value
  # The literal percent sign has to be written as %%. A lone trailing % is an
  # incomplete format specifier and raises ArgumentError on recent Ruby versions.
  "%.2f%%" % (value * 100.0)
end
app.helpers Admin
end
|