5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
# File 'lib/export_command.rb', line 5
# Crawls the site at opts[:url] with CobwebCrawler and mirrors every crawled
# page into a directory tree on disk.
#
# Text pages (page[:text_content] truthy) are stored as the YAML dump of the
# page hash, enriched with :description, :keywords and :meta_title taken from
# the matching <meta> tags when present. All other pages are stored as the
# Base64-decoded body.
#
# @param opts [Hash] crawler options; must contain :url. Recognised here:
#   :root_path (export directory, defaults to "<pwd>/<host of :url>") and
#   :seed_url_file (file with one seed URL per line, converted to :seed_urls).
#   Defaults of :cache => 600, :crawl_limit => 1000000 and
#   :raise_exceptions => true are applied unless overridden.
# @param path [Object] ignored; kept so existing callers keep working.
# @return [Object] whatever CobwebCrawler#crawl returns (the crawl statistics).
def self.start(opts, path)
  site_uri = URI.parse(opts[:url])
  options = {
    :cache => 600,
    :crawl_limit => 1000000,
    :raise_exceptions => true,
    # host is nil for scheme-less URLs; to_s keeps File.join from raising.
    :root_path => File.join(Dir.pwd, site_uri.host.to_s)
  }.merge(opts)

  if options.key?(:seed_url_file)
    seed_file = options.delete(:seed_url_file)
    # Strip line terminators and skip blank lines so seeds are usable URLs.
    options[:seed_urls] = File.readlines(seed_file).map(&:strip).reject(&:empty?)
  end

  root = options[:root_path]

  statistics = CobwebCrawler.new(options).crawl(options[:url]) do |page|
    begin
      puts "Just crawled #{page[:url]} and got a status of #{page[:status_code]}."
      page_uri = URI.parse(page[:url])

      # Crawled URLs are untrusted: drop empty, "." and ".." segments so a
      # page can never be written outside the export root.
      segments = page_uri.path.to_s.split("/").reject do |segment|
        segment.empty? || segment == "." || segment == ".."
      end

      # The last segment names the file; a bare "/" maps to index.html.
      filename = segments.pop || "index.html"
      unless page_uri.query.nil?
        filename = filename + "_" + page_uri.query.gsub("/", "%2F")
      end

      FileUtils.mkdir_p(root)
      dir = root
      segments.each do |segment|
        dir = File.join(dir, segment)
        if File.exist?(dir) && !File.directory?(dir)
          # A page was already saved under this name; turn it into a
          # directory and keep the earlier page as its index.html.
          FileUtils.mv(dir, dir + ".tmp")
          Dir.mkdir(dir)
          FileUtils.mv(dir + ".tmp", File.join(dir, "index.html"))
        else
          Dir.mkdir(dir) unless Dir.exist?(dir)
        end
      end

      target = File.join(dir, filename)
      # Reverse of the case above: child pages were exported first, so this
      # page becomes the index of the already existing directory.
      target = File.join(target, "index.html") if File.directory?(target)

      if page[:text_content]
        doc = Nokogiri::HTML.parse(page[:body])
        { :description => "description", :keywords => "keywords", :meta_title => "title" }.each do |key, meta_name|
          node = doc.search("meta[name=#{meta_name}]").first
          page[key] = node.content if node
        end
        File.open(target, "w+") { |f| f.write(page.to_yaml) }
      else
        File.open(target, "wb") { |f| f.write(Base64.decode64(page[:body])) }
      end
    rescue => e
      # Best effort: report the failure and continue with the next page.
      puts e.message
      puts e.backtrace
    end
  end

  # Statistics only exist once the crawl has returned, so report them here.
  puts "Finished Crawl with #{statistics[:page_count]} pages and #{statistics[:asset_count]} assets." if statistics
  statistics
end
|