Class: Scrappy::Agent

Inherits: Object

Inheritance chain: Object → Scrappy::Agent

Includes: MapReduce, MonitorMixin, BlindAgent, Cached, Extractor, Optimizer, Trainer

Defined in: lib/scrappy/agent/agent.rb

Constant Summary collapse

# Default settings for agents. Each Agent keeps its own copy
# (`@options = Options.clone` in #initialize) and falls back to these values
# for settings that are not passed in explicitly.
Options =

OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :delay=>0, :workers=>10

# Maps an output format symbol to the MIME type reported by #proxy.
# Formats missing from this table are reported as 'text/plain'.
ContentTypes =

{ :png => 'image/png', :rdfxml => 'application/rdf+xml',
:rdf => 'application/rdf+xml' }

Instance Attribute Summary collapse

#id ⇒ Object

Returns the value of attribute id.
#kb ⇒ Object

Returns the value of attribute kb.
#options ⇒ Object

Returns the value of attribute options.

Class Method Summary collapse

Instance Method Summary collapse

#initialize(args = {}) ⇒ Agent constructor

A new instance of Agent.
#map(args, queue = nil) ⇒ Object
#observe(uris) ⇒ Object

Observes several websites and periodically extracts their data.
#proxy(args = {}) ⇒ Object
#reduce(results) ⇒ Object
#request(args = {}) ⇒ Object

Constructor Details

#initialize(args = {}) ⇒ `Agent`

Returns a new instance of Agent.

# File 'lib/scrappy/agent/agent.rb', line 24

# Builds an agent and registers it in Agent.pool.
#
# args - Hash of optional settings:
#        :workers    - stored as @cluster_count (default: Options.workers)
#        :parent     - stored as @cluster
#        :id         - key used in Agent.pool (default: current pool size)
#        :kb         - default: Options.kb
#        :repository - default: Options.repository
def initialize(args = {})
  # Explicit empty argument list so that +args+ is not forwarded to ancestors
  super()

  @cluster       = args[:parent]
  @cluster_count = args[:workers] || Options.workers

  # Settings template stored as the single entry of @cluster_options
  worker_defaults = {
    :referenceable => Options.referenceable,
    :agent         => Options.agent,
    :workers       => 1,
    :window        => false
  }
  @cluster_options = [worker_defaults]

  # Register under the requested id, or under the next free index
  @id = args[:id] || Agent.pool.size
  Agent.pool[@id] = self

  @kb         = args[:kb] || Options.kb
  @repository = args[:repository] || Options.repository

  # Private copy, so per-agent changes do not touch the shared defaults
  @options = Options.clone
end

Instance Attribute Details

#id ⇒ `Object`

Returns the value of attribute id.



22
23
24

# File 'lib/scrappy/agent/agent.rb', line 22

# Key under which this agent is registered in Agent.pool
# (set in #initialize from args[:id], or the pool size when omitted).
def id
  @id
end

#kb ⇒ `Object`

Returns the value of attribute kb.



22
23
24

# File 'lib/scrappy/agent/agent.rb', line 22

# Value given as args[:kb] to #initialize, or Options.kb when omitted.
# NOTE(review): presumably the agent's knowledge base -- confirm.
def kb
  @kb
end

#options ⇒ `Object`

Returns the value of attribute options.



22
23
24

# File 'lib/scrappy/agent/agent.rb', line 22

# This agent's own settings object, created in #initialize as a clone of
# the shared Options defaults.
def options
  @options
end

Class Method Details

.[](id) ⇒ `Object`



18
19
20

# File 'lib/scrappy/agent/agent.rb', line 18

# Returns the agent stored in the pool under +id+; when the pool holds no
# (truthy) entry for it, a new Agent is created with that id instead.
def self.[] id
  registered = pool[id]
  return registered if registered

  Agent.new(:id => id)
end

.pool ⇒ `Object`



15
16
17

# File 'lib/scrappy/agent/agent.rb', line 15

# Registry of agents keyed by id; the Hash is created on first access and
# the same instance is returned on every later call.
def self.pool
  return @pool if @pool

  @pool = {}
end

Instance Method Details

#map(args, queue = nil) ⇒ `Object`

# File 'lib/scrappy/agent/agent.rb', line 37

# Obtains the triples for one request and schedules the resources they mention.
#
# args  - Hash with :uri and, optionally, :method (default :get),
#         :inputs (default {}) and :depth (default options.depth).
# queue - optional work queue. When given, discovered items are handed to
#         queue.push_unless_done; otherwise they are processed inline and
#         their triples are appended to the result.
#
# Returns the collected triples, or nil when options.dump is set.
def map args, queue=nil
  depth = args[:depth] || options.depth
  req   = {
    :method => args[:method] || :get,
    :uri    => complete_uri(args[:uri]),
    :inputs => args[:inputs] || {}
  }

  # Expire cache
  cache.expire! 300 # 5 minutes

  # Cache first, then the repository (when configured), then a live request
  triples =
    if cache[req]
      puts "Retrieving cached #{req[:uri]}...done!" if options.debug

      cache[req][:response]
    elsif @repository
      request_from_repository(req)
    else
      request_uncached(req)
    end

  # Only store the response when no cache entry exists yet
  unless cache[req]
    cache[req] = { :time=>Time.now, :response=>triples }
    if self.uri
      cache[req.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>triples }
    end
  end

  # Subjects (Symbols only) that are declared with the given rdf:type
  subjects_of_type = lambda do |type|
    typed = triples.select { |s, p, o| p == ID("rdf:type") && o == ID(type) }
    typed.map { |s, p, o| s }.select { |node| node.is_a?(Symbol) }
  end

  # Pages are enqueued without reducing depth
  pages = subjects_of_type.call("sc:Page")

  # All other URIs are enqueued with depth reduced (none at all when depth is 0)
  uris =
    if depth != 0
      endpoints = triples.map { |s, p, o| [s, o] }.flatten
      (endpoints - [ID(self.uri)] - pages).select { |node| node.is_a?(Symbol) }
    else
      []
    end

  # Recently created URIs are not followed
  nofollow = subjects_of_type.call("sc:NewUri")
  pages -= nofollow
  uris  -= nofollow

  page_items = pages.map { |uri| { :uri=>uri.to_s, :depth=>[-1, depth].max } }
  link_items = uris.map  { |uri| { :uri=>uri.to_s, :depth=>[-1, depth-1].max } }
  items = (page_items + link_items).uniq.reject { |item| RDF::ID.bnode?(item[:uri]) }

  if options.debug
    items.each do |item|
      # Stay silent for items the queue has already seen or still holds
      known = queue && (queue.history + queue.items).include?(item)
      puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" unless known
    end
  end

  if queue
    items.each { |item| queue.push_unless_done(item) }
  else
    triples += process(items)
  end

  options.dump ? nil : triples
end

#observe(uris) ⇒ `Object`

Observes several websites and periodically extracts their data.

# File 'lib/scrappy/agent/agent.rb', line 140

# Requests every URI in +uris+ over and over, pausing between rounds.
#
# The pause is options.repository.time * 60 minus the time the round itself
# took (whole seconds). Kernel#sleep raises ArgumentError for a negative
# duration, so when a round takes longer than the interval the pause is
# clamped to zero and the next round starts immediately.
#
# uris - collection of URIs; each one is passed to #request as :uri.
#
# Never returns: the loop has no exit condition.
def observe uris
  while true
    time_init = Time.now.to_i
    uris.each do |uri|
      puts "Pinging #{uri}..."
      request :uri=>uri
    end
    elapsed = Time.now.to_i - time_init
    # Never hand a negative duration to sleep
    time = [options.repository.time * 60 - elapsed, 0].max
    puts "Sleeping until #{Time.now + time}..."
    sleep time
  end
end

#proxy(args = {}) ⇒ `Object`

# File 'lib/scrappy/agent/agent.rb', line 114

# Runs a request and wraps the serialized result for delivery.
#
# args - request settings merged over the defaults
#        :method=>:get, :inputs=>{}, :format=>options.format,
#        :depth=>options.depth.
#
# Returns an OpenStruct with:
#   output       - serialized graph, or "" when options.dump is set
#   content_type - ContentTypes entry for the format, else 'text/plain'
#   uri          - the agent's current uri
#   status       - :error when html_data? is false; otherwise :ok when the
#                  agent's uri equals the requested :uri, else :redirect
def proxy args={}
  defaults = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }
  params   = defaults.merge(args)

  graph = request(params)

  if options.dump
    body = ""
  else
    if options.debug
      print "Serializing..."
      $stdout.flush
    end

    body = graph.serialize(params[:format], options.format_header)

    puts 'done!' if options.debug
  end

  content_type = ContentTypes[params[:format]] || 'text/plain'
  current_uri  = uri

  status =
    if !html_data?
      :error
    elsif uri == params[:uri]
      :ok
    else
      :redirect
    end

  OpenStruct.new :output => body,
                 :content_type => content_type,
                 :uri => current_uri,
                 :status => status
end

#reduce(results) ⇒ `Object`

# File 'lib/scrappy/agent/agent.rb', line 95

# Merges several triple lists into one list without duplicates.
#
# results - collection of Arrays of triples.
#
# Returns an Array holding every distinct triple in first-seen order, or []
# straight away when options.dump is set.
def reduce results
  return [] if options.dump

  if options.debug
    print "Merging results..."; $stdout.flush
  end

  # Append in place: `triples += result` would copy the whole accumulator on
  # every iteration. The accumulator is a fresh Array, so the inputs are
  # never modified.
  triples = []
  results.each { |result| triples.concat(result) }
  triples.uniq!

  puts 'done!' if options.debug

  triples
end

#request(args = {}) ⇒ `Object`



110
111
112

# File 'lib/scrappy/agent/agent.rb', line 110

# Runs #map for +args+ and wraps the cleaned triples in an RDF::Graph.
# A nil result from #map (it returns nil when options.dump is set) is
# treated as an empty triple list.
def request args={}
  triples = map(args)
  triples = [] unless triples

  RDF::Graph.new(clean(triples))
end

Class: Scrappy::Agent

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from BlindAgent

Methods included from Cached

Methods included from MapReduce

Methods included from Optimizer

Methods included from Trainer

Methods included from Extractor

Constructor Details

#initialize(args = {}) ⇒ Agent

Instance Attribute Details

#id ⇒ Object

#kb ⇒ Object

#options ⇒ Object

Class Method Details

.[](id) ⇒ Object

.pool ⇒ Object