Class: Wraith::Crawler

Inherits:
Spider
  • Object
show all
Defined in:
lib/wraith/spider.rb

Constant Summary collapse

# File extensions whose links are excluded from the crawl (media, archives,
# assets, documents, feeds).
#
# NOTE: no trailing backslashes between lines. Inside %w a backslash escapes
# the following whitespace, so `\<newline>` would glue a literal newline onto
# the next word (e.g. "\ngz") and that extension would never match.
EXT =
%w(flv swf png jpg gif asx zip rar tar 7z
gz jar js css dtd xsd ico raw mp3 mp4
wav wmv ape aac ac3 wma aiff mpg mpeg
avi mov ogg mkv mka asf mp2 m1v
m3u f4v pdf doc xls ppt pps bin exe rss xml).freeze

Instance Method Summary collapse

Methods inherited from Spider

#determine_paths, #initialize

Constructor Details

This class inherits a constructor from Wraith::Spider

Instance Method Details

#modified_since(file, since) ⇒ Object



72
73
74
# File 'lib/wraith/spider.rb', line 72

# Reports whether +file+ was last modified fewer than +since+ days ago.
#
# Uses the modification time (mtime) rather than ctime: ctime is the inode
# change time on Unix (bumped by chmod/rename/utime) and the creation time on
# Windows, so neither reflects when the file's content was last written.
#
# @param file [String] path to an existing file
# @param since [Numeric] age threshold in days
# @return [Boolean] true when the file's age in days is strictly below +since+
# @raise [Errno::ENOENT] if +file+ does not exist
def modified_since(file, since)
  seconds_per_day = 24 * 3600
  age_in_days = (Time.now - File.mtime(file)) / seconds_per_day
  age_in_days < since
end

#spider ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/wraith/spider.rb', line 57

# Crawls the site unless a sufficiently recent spider file already exists.
#
# When `@wraith.spider_file` exists and `modified_since` reports it as newer
# than `@wraith.spider_days[0]` days, only a notice is printed. Otherwise the
# site at `@wraith.base_domain` is crawled with Anemone, skipping links that
# end in one of the EXT extensions or match `@wraith.spider_skips`, and the
# URL path of every visited page is handed to `add_path`.
def spider
  if File.exist?(@wraith.spider_file) && modified_since(@wraith.spider_file, @wraith.spider_days[0])
    puts 'using existing spider file'
  else
    puts 'creating new spider file'
    Anemone.crawl(@wraith.base_domain) do |anemone|
      # The alternation must be grouped: without the group, `\.` applies only
      # to the first extension and `$` only to the last, so any path merely
      # containing e.g. "js" or "doc" would be skipped.
      anemone.skip_links_like(/\.(?:#{EXT.join('|')})$/)
      # Add user specified skips
      anemone.skip_links_like(@wraith.spider_skips)
      anemone.on_every_page { |page| add_path(page.url.path) }
    end
  end
end