Module: Scrappy::Extractor
- Included in:
- Agent
- Defined in:
- lib/scrappy/extractor/extractor.rb
Instance Method Summary collapse
- #extract(uri, html, kb, referenceable = nil) ⇒ Object
-
#extract_graph(fragments, options) ⇒ Object
Extracts all mappings from the given fragments and returns them merged into a single graph.
-
#fragments_for(kb, uri) ⇒ Object
Returns a list of fragments that have mappings in a given URI.
Instance Method Details
#extract(uri, html, kb, referenceable = nil) ⇒ Object
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/scrappy/extractor/extractor.rb', line 10
#
# Parses an HTML document and returns the triples extracted from it by the
# fragments in +kb+ that apply to +uri+.
#
# The whole extraction runs inside +synchronize+, and debug output is
# controlled by +options.debug+; both are expected to be provided by the
# including class (presumably Agent -- confirm against the includer).
#
# @param uri [Object] URI of the document; passed to the fragments as
#   +:doc => { :uri => ... }+ and used to select the applicable fragments
# @param html [String] raw HTML, parsed with Nokogiri as UTF-8
# @param kb [#triples] knowledge base holding the fragment definitions
# @param referenceable [Object, nil] forwarded untouched to each fragment
#   under the +:referenceable+ key
# @return [Object] the +triples+ of the extracted graph (as returned by the
#   +synchronize+ block)
def extract(uri, html, kb, referenceable = nil)
  synchronize do
    if options.debug
      print "Extracting #{uri}..."
      $stdout.flush
    end

    # Restart stateful selectors: work on a fresh graph built from the same
    # triples instead of the caller's instance.
    kb = RDF::Graph.new(kb.triples)

    # Parse document
    content = Nokogiri::HTML(html, nil, 'utf-8')

    # Extract each fragment. The hash gets its own name so that it does not
    # shadow the +options+ accessor that is read for the debug flag.
    fragment_options = {
      :doc => { :uri => uri, :content => content },
      :referenceable => referenceable
    }
    output = extract_graph(fragments_for(kb, uri), fragment_options)

    puts "done!" if options.debug

    output.triples
  end
end
#extract_graph(fragments, options) ⇒ Object
Extracts all mappings from the given fragments and returns them merged into a single graph
55 56 57 58 59 |
# File 'lib/scrappy/extractor/extractor.rb', line 55
#
# Runs every fragment's extraction and collects all results into one new
# graph.
#
# @param fragments [Enumerable<#extract>] fragments to run; each must
#   respond to +extract(options)+ and return an enumerable of results
# @param options [Object] passed unchanged to every fragment's +extract+
# @return [RDF::Graph] a new graph with every result appended via +<<+
def extract_graph(fragments, options)
  output = RDF::Graph.new
  fragments.each do |fragment|
    fragment.extract(options).each { |result| output << result }
  end
  output
end
#fragments_for(kb, uri) ⇒ Object
Returns a list of fragments that have mappings in a given URI
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/scrappy/extractor/extractor.rb', line 33
#
# Returns the proxies of the root fragments in +kb+ that apply to +uri+.
#
# Root fragments are the nodes typed +sc:Fragment+ minus whatever
# +kb.find([], Node('sc:subfragment'), nil)+ returns (presumably the
# fragments nested inside another fragment -- confirm against the RDF
# library's +find+ semantics).
#
# A fragment is selected through its +sc:selector+ values:
# * selectors typed +sc:UriSelector+ or +sc:UriPatternSelector+ count only
#   when filtering them with +:uri => uri+ yields a non-empty result;
# * selectors typed +sc:VisualSelector+ always count, whatever the URI.
#
# The result lists URI-selected fragments first, then visually selected
# ones; a fragment reachable through several selectors appears once per
# selector.
#
# @param kb [#find, #node] knowledge base holding the fragment definitions
# @param uri [Object] URI the selectors are filtered against
# @return [Array] one +proxy+ per matching selector
def fragments_for(kb, uri)
  root_fragments = kb.find(nil, Node('rdf:type'), Node('sc:Fragment')) -
                   kb.find([], Node('sc:subfragment'), nil)

  selectors = []
  fragments = {}
  root_fragments.each do |fragment|
    fragment.sc::selector.each do |selector|
      # When two fragments share a selector, the later fragment wins here.
      fragments[selector] = fragment
      selectors << selector
    end
  end

  uri_selectors = selectors.select do |selector|
    types = selector.rdf::type
    uri_based = types.include?(Node('sc:UriSelector')) ||
                types.include?(Node('sc:UriPatternSelector'))
    uri_based && !kb.node(selector).filter(:uri => uri).empty?
  end

  visual_selectors = selectors.select do |selector|
    selector.rdf::type.include?(Node('sc:VisualSelector'))
  end

  (uri_selectors + visual_selectors).map { |selector| fragments[selector].proxy }
end