Class: SpeedSpider::Crawler

Inherits:

Object

Object
SpeedSpider::Crawler

show all

Defined in: lib/speed_spider/crawler.rb

Instance Method Summary collapse

#after_crawl ⇒ Object
#crawl ⇒ Object
#focus_crawl ⇒ Object
#get_urls_from_css(data, pos = 0) ⇒ Object

Returns the URLs referenced by `url(...)` expressions in the contents of a CSS file.
#initialize(start_url, options) ⇒ Crawler constructor

A new instance of Crawler.

Constructor Details

#initialize(start_url, options) ⇒ `Crawler`

# File 'lib/speed_spider/crawler.rb', line 8

# Stores the crawl entry point and its settings on the new crawler.
#
# @param start_url the URL crawling starts from; stored unchanged
# @param options [Hash] crawl settings (expected to be hash-like); kept as a
#   whole, and its +:base_url+ entry is additionally cached on its own
def initialize(start_url, options)
  @options = options
  @start_url = start_url
  @base_url = @options[:base_url]
end

Instance Method Details

#after_crawl ⇒ `Object`

# File 'lib/speed_spider/crawler.rb', line 56

# Builds the callback that #crawl registers through the spider's +after_crawl+
# hook. The callback writes every crawled page to disk under
# "<@options[:dir]>/<host><path>".
#
# Paths that are empty or end in "/" are stored as "index.html" inside the
# corresponding directory.
#
# @return [Proc] a lambda taking the collection of crawled pages; the
#   collection must yield (url, page) pairs from +each+, and each page must
#   respond to +url+ (with +path+ and +host+) and +body+
def after_crawl
  lambda { |pages|
    pages.each do |_url, page|
      path = page.url.path
      if path.empty?
        # An empty path has no leading slash; add one so the file name is not
        # glued onto the host name ("example.comindex.html").
        path = '/index.html'
      elsif path.end_with?('/')
        path = "#{path}index.html"
      end

      path = "#{@options[:dir]}/#{page.url.host}#{path}"
      FileUtils.mkdir_p(File.dirname(path))

      # Binary mode: bodies may be images or other non-text assets and must be
      # written byte-for-byte on every platform.
      File.open(path, 'wb') { |f| f.write(page.body) }

      puts "save file #{path}" if @options[:verbose]
    end
  }
end

#crawl ⇒ `Object`

# File 'lib/speed_spider/crawler.rb', line 75

# Starts the crawl at @start_url with @options, registering the lambdas built
# by #focus_crawl and #after_crawl as the spider's hooks of the same names.
#
# @return whatever +Anemone.crawl+ returns
def crawl
  Anemone.crawl(@start_url, @options) do |anemone|
    anemone.focus_crawl(&focus_crawl)
    anemone.after_crawl(&after_crawl)
  end
end

#focus_crawl ⇒ `Object`

# File 'lib/speed_spider/crawler.rb', line 23

# Builds the callback that #crawl registers through the spider's +focus_crawl+
# hook. For each page it returns the page's own links plus extra same-domain
# asset URLs:
#
# * parsed documents: the +src+ of script, img and iframe elements and the
#   +href+ of link elements;
# * pages without a parsed document whose URL ends in ".css": every url(...)
#   reference found in the body.
#
# References that are blank, cannot be resolved, or resolve outside the page's
# domain are dropped.
#
# @return [Proc] a lambda taking a page and returning an Array of links; the
#   page must respond to +doc+, +url+, +body+, +links+, +to_absolute+ and
#   +in_domain?+
def focus_crawl
  lambda { |page|
    # Resolves raw references against the page, keeping same-domain URLs only.
    same_domain = lambda { |refs|
      refs.each_with_object([]) do |ref, found|
        next if ref.nil? || ref.empty?

        begin
          absolute = page.to_absolute(ref)
        rescue StandardError
          next # unresolvable reference: skip it rather than abort the page
        end

        found << absolute if page.in_domain?(absolute)
      end
    }

    refs =
      if page.doc
        # javascript, image and iframe sources first, then stylesheet links
        sources = page.doc.search('//script[@src]', '//img[@src]', '//iframe[@src]').map { |node| node['src'] }
        hrefs = page.doc.search('//link[@href]').map { |node| node['href'] }
        sources + hrefs
      elsif page.url.to_s.end_with?('.css')
        # url( "x" ) may be quoted and padded with whitespace; remove both.
        # +to_s+ guards against a page whose body is nil.
        get_urls_from_css(page.body.to_s).map { |raw| raw.delete("\"'").strip }
      else
        []
      end

    page.links + same_domain.call(refs).uniq
  }
end

#get_urls_from_css(data, pos = 0) ⇒ `Object`

Returns the URLs referenced by `url(...)` expressions in the contents of a CSS file.

# File 'lib/speed_spider/crawler.rb', line 15

# Returns the raw contents of every url(...) expression in a CSS string.
#
# Matching is case-insensitive and non-greedy, so each result is the text
# between "url(" and the next ")" on the same line. Surrounding quotes and
# whitespace are returned untouched.
#
# Implemented as a single scan rather than one recursive call per match, so
# stylesheets with many references cannot exhaust the call stack.
#
# @param data [String, nil] CSS source; nil is treated as empty
# @param pos [Integer] character offset at which the search starts
# @return [Array<String>] captured references in order of appearance; empty
#   when nothing matches or +pos+ lies outside the string
def get_urls_from_css(data, pos = 0)
  tail = data.to_s[pos..-1]
  return [] if tail.nil?

  tail.scan(/url\((.*?)\)/i).flatten
end

Class: SpeedSpider::Crawler

Instance Method Summary collapse

Constructor Details

#initialize(start_url, options) ⇒ Crawler

Instance Method Details

#after_crawl ⇒ Object

#crawl ⇒ Object

#focus_crawl ⇒ Object

#get_urls_from_css(data, pos = 0) ⇒ Object

#initialize(start_url, options) ⇒ `Crawler`

#after_crawl ⇒ `Object`

#crawl ⇒ `Object`

#focus_crawl ⇒ `Object`

#get_urls_from_css(data, pos = 0) ⇒ `Object`