Module: Scrappy::Optimizer

Included in:
Agent
Defined in:
lib/scrappy/learning/optimizer.rb

Instance Method Summary

Instance Method Details

#optimize_extractors(kb, samples) ⇒ Object

Iterates through a knowledge base (kb) and tries to merge and generalize selectors whenever the output of the resulting knowledge base stays the same.



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/scrappy/learning/optimizer.rb', line 7

# Builds an optimized extractor graph for the given samples.
#
# Steps, as performed below:
# 1. Take what +kb.find+ returns for sc:Fragment-typed nodes and remove what
#    it returns for the sc:subfragment query (presumably leaving only
#    top-level fragments -- confirm against the kb.find semantics).
# 2. Keep the fragments that have a sc:UriSelector or sc:UriPatternSelector
#    whose filter yields a non-empty result for at least one sample URI.
# 3. Hand their subfragments to +optimize_all+ in +:extractors+ mode.
# 4. Attach every optimized fragment, through sc:subfragment, to one new
#    superfragment whose selector is produced by +uri_selector_for+.
#
# @param kb [Object] knowledge base; used through +find+ and +node+
# @param samples [Enumerable<Hash>] samples, each read through its +:uri+ key
# @return [RDF::Graph] graph built from the collected triples
def optimize_extractors(kb, samples)
  typed_fragments  = kb.find(nil, Node('rdf:type'), Node('sc:Fragment'))
  nested_fragments = kb.find([], Node('sc:subfragment'), nil)
  top_level        = typed_fragments - nested_fragments

  matching_superfragments = top_level.select do |candidate|
    candidate.sc::selector.any? do |sel|
      # Only URI-based selectors are eligible.
      uri_based = %w[sc:UriSelector sc:UriPatternSelector].any? do |type_name|
        sel.rdf::type.include?(Node(type_name))
      end
      next false unless uri_based

      # ...and the selector has to match the URI of at least one sample.
      samples.any? do |sample|
        hits = kb.node(sel).filter(uri: sample[:uri])
        !hits.empty?
      end
    end
  end
  extracted = matching_superfragments.map { |parent| parent.sc::subfragment }.flatten

  optimized = optimize_all(extracted, samples, :extractors)

  # Single parent fragment, selected by the sample URIs, that will own every
  # optimized fragment.
  superfragment = Node(nil)
  identifier    = Node(nil)
  selector      = uri_selector_for(samples.map { |sample| sample[:uri] })

  identifier.rdf::type         = Node('sc:BaseUriSelector')
  superfragment.rdf::type      = Node('sc:Fragment')
  superfragment.sc::selector   = selector
  superfragment.sc::identifier = identifier
  [selector, identifier].each { |node| superfragment.graph << node }

  # One linking triple per fragment, followed by that fragment's own triples.
  collected = optimized.each_with_object([]) do |fragment, acc|
    acc << [superfragment.id, ID('sc:subfragment'), fragment.id]
    acc.concat(fragment.all_triples)
  end

  RDF::Graph.new(collected + superfragment.all_triples)
end

#optimize_patterns(kb, samples) ⇒ Object

Iterates through a knowledge base (kb) and tries to merge and generalize selectors whenever the output of the resulting knowledge base stays the same.



45
46
47
48
49
50
51
52
53
54
# File 'lib/scrappy/learning/optimizer.rb', line 45

# Builds an optimized pattern graph for the given samples.
#
# Takes what +kb.find+ returns for sc:Fragment-typed nodes, removes what it
# returns for the sc:subfragment query (presumably leaving only top-level
# fragments -- confirm against the kb.find semantics), runs the remainder
# through +optimize_all+ in +:patterns+ mode and gathers the triples of every
# resulting fragment into one graph.
#
# @param kb [Object] knowledge base; used through +find+
# @param samples [Object] samples, forwarded untouched to +optimize_all+
# @return [RDF::Graph] graph built from the triples of the optimized fragments
def optimize_patterns(kb, samples)
  typed_fragments  = kb.find(nil, Node('rdf:type'), Node('sc:Fragment'))
  nested_fragments = kb.find([], Node('sc:subfragment'), nil)
  top_level        = typed_fragments - nested_fragments

  optimized = optimize_all(top_level, samples, :patterns)

  collected = optimized.each_with_object([]) do |fragment, acc|
    acc.concat(fragment.all_triples)
  end

  RDF::Graph.new(collected)
end