Class: Staticizer::Crawler

Inherits: Object
Defined in: lib/staticizer/crawler.rb
Instance Attribute Summary

- #output_dir ⇒ Object
  Returns the value of attribute output_dir.
- #url_queue ⇒ Object (readonly)
  Returns the value of attribute url_queue.
Instance Method Summary

- #add_url(url, info = {}) ⇒ Object
- #add_urls(urls, info = {}) ⇒ Object
- #crawl ⇒ Object
- #extract_css_urls(css, base_uri) ⇒ Object
- #extract_hrefs(doc, base_uri) ⇒ Object
- #extract_images(doc, base_uri) ⇒ Object
- #extract_links(doc, base_uri) ⇒ Object
- #extract_scripts(doc, base_uri) ⇒ Object
- #initialize(initial_page, opts = {}) ⇒ Crawler (constructor)
  A new instance of Crawler.
- #log_level ⇒ Object
- #log_level=(level) ⇒ Object
- #make_absolute(base_uri, href) ⇒ Object
- #process_body(body, uri, opts) ⇒ Object
- #process_redirect(url, destination_url) ⇒ Object
  If we hit a redirect, we save the redirect as a meta refresh page. TODO: for AWS S3 hosting we could instead create a redirect.
- #process_success(response, parsed_uri) ⇒ Object
- #process_url(url, info) ⇒ Object
  Fetch a URI and save it to disk.
- #save_page(response, uri) ⇒ Object
- #save_page_to_aws(response, uri) ⇒ Object
- #save_page_to_disk(response, uri) ⇒ Object
Constructor Details
#initialize(initial_page, opts = {}) ⇒ Crawler
Returns a new instance of Crawler.
# File 'lib/staticizer/crawler.rb', line 12

def initialize(initial_page, opts = {})
  if initial_page.nil?
    raise ArgumentError, "Initial page required"
  end

  @opts = opts.dup
  @url_queue = []
  @processed_urls = []
  @output_dir = @opts[:output_dir] || File.expand_path("crawl/")
  @log = @opts[:logger] || Logger.new(STDOUT)
  @log.level = @opts[:log_level] || Logger::INFO

  if @opts[:aws]
    bucket_name = @opts[:aws].delete(:bucket_name)
    AWS.config(opts[:aws])
    @s3_bucket = AWS::S3.new.buckets[bucket_name]
    @s3_bucket.acl = :public_read
  end

  if @opts[:valid_domains].nil?
    uri = URI.parse(initial_page)
    @opts[:valid_domains] ||= [uri.host]
  end

  if @opts[:process_body]
    @process_body = @opts[:process_body]
  end

  add_url(initial_page)
end
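For orientation, a minimal construction sketch follows. The option keys shown are the ones this constructor reads; the URL and output path are placeholders.

require 'staticizer'

crawler = Staticizer::Crawler.new("http://example.com",
  :output_dir => "/tmp/example_crawl",    # default is File.expand_path("crawl/")
  :log_level => Logger::DEBUG,            # forwarded to the internal Logger
  :valid_domains => ["example.com"])      # defaults to the initial page's host
crawler.crawl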
Instance Attribute Details
#output_dir ⇒ Object
Returns the value of attribute output_dir.
# File 'lib/staticizer/crawler.rb', line 10

def output_dir
  @output_dir
end
#url_queue ⇒ Object (readonly)
Returns the value of attribute url_queue.
# File 'lib/staticizer/crawler.rb', line 9

def url_queue
  @url_queue
end
Instance Method Details
#add_url(url, info = {}) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 91

def add_url(url, info = {})
  if @opts[:filter_url]
    url = @opts[:filter_url].call(url, info)
    return if url.nil?
  else
    regex = "(#{@opts[:valid_domains].join(")|(")})"
    return if url !~ %r{^https?://#{regex}}
  end

  url = url.sub(/#.*$/, '') # strip off any fragments
  return if @url_queue.index {|u| u[0] == url } || @processed_urls.include?(url)
  @url_queue << [url, info]
end
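If a :filter_url callback is supplied it replaces the :valid_domains check entirely: returning nil drops the URL, and whatever string it returns is what gets queued. A sketch with a hypothetical filter:

# Hypothetical filter: drop PDFs, queue everything else unchanged.
Staticizer::Crawler.new("http://example.com",
  :filter_url => lambda do |url, info|
    return nil if url =~ /\.pdf$/i   # nil means "do not queue"
    url                              # the returned string is queued
  end)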
#add_urls(urls, info = {}) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 81

def add_urls(urls, info = {})
  urls.compact.uniq.each {|url| add_url(url, info.dup) }
end
#crawl ⇒ Object
# File 'lib/staticizer/crawler.rb', line 51

def crawl
  @log.info("Starting crawl")
  while(@url_queue.length > 0)
    url, info = @url_queue.shift
    @processed_urls << url
    process_url(url, info)
  end
  @log.info("Finished crawl")
end
#extract_css_urls(css, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 77

def extract_css_urls(css, base_uri)
  css.scan(/url\(['"]?(.+?)['"]?\)/).map {|src| make_absolute(base_uri, src[0]) }
end
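The regex captures the argument of each url(...) reference, quoted or not, and resolves it against the stylesheet's own URI. For example (crawler is an instance as in the constructor sketch above):

css = "body { background: url('/img/bg.png') } .a { src: url(fonts/a.woff) }"
crawler.extract_css_urls(css, "http://example.com/css/site.css")
# => ["http://example.com/img/bg.png", "http://example.com/css/fonts/a.woff"]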
#extract_hrefs(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 61

def extract_hrefs(doc, base_uri)
  doc.xpath("//a/@href").map {|href| make_absolute(base_uri, href) }
end
#extract_images(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 65

def extract_images(doc, base_uri)
  doc.xpath("//img/@src").map {|src| make_absolute(base_uri, src) }
end
#extract_links(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 69

def extract_links(doc, base_uri)
  doc.xpath("//link/@href").map {|href| make_absolute(base_uri, href) }
end
#extract_scripts(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 73

def extract_scripts(doc, base_uri)
  doc.xpath("//script/@src").map {|src| make_absolute(base_uri, src) }
end
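All four extract_* helpers share one shape: select attribute nodes with an XPath query over a parsed Nokogiri document, then resolve each against base_uri via #make_absolute. The XPath step in isolation, as a sketch:

require 'nokogiri'

doc = Nokogiri::HTML("<html><body><a href='/about'>About</a></body></html>")
doc.xpath("//a/@href").map(&:value)   # => ["/about"]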
#log_level ⇒ Object
# File 'lib/staticizer/crawler.rb', line 43

def log_level
  @log.level
end
#log_level=(level) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 47

def log_level=(level)
  @log.level = level
end
#make_absolute(base_uri, href) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 85

def make_absolute(base_uri, href)
  URI::join(base_uri, href).to_s
rescue StandardError => e
  @log.error "Could not make absolute '#{base_uri}' - '#{href}' - #{e}"
end
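This is a thin wrapper around URI::join, so relative and absolute hrefs resolve the usual way, and unparseable input is logged rather than raised. For example:

URI::join("http://example.com/a/b.html", "c.html").to_s   # => "http://example.com/a/c.html"
URI::join("http://example.com/a/b.html", "/c.html").to_s  # => "http://example.com/c.html"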
#process_body(body, uri, opts) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 200

def process_body(body, uri, opts)
  if @process_body
    body = @process_body.call(body, uri, opts)
  end
  body
end
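Every fetched body passes through this hook before it is written, which makes the :process_body option the natural place to rewrite markup. A sketch with a hypothetical rewriter:

# Hypothetical hook: make links to the crawled host relative before saving.
Staticizer::Crawler.new("http://example.com",
  :process_body => lambda do |body, uri, opts|
    body.gsub(%r{https?://(www\.)?example\.com}, "")
  end)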
#process_redirect(url, destination_url) ⇒ Object
If we hit a redirect, we save the redirect as a meta refresh page. TODO: for AWS S3 hosting we could instead create a redirect.
# File 'lib/staticizer/crawler.rb', line 195

def process_redirect(url, destination_url)
  body = "<html><head><META http-equiv='refresh' content='0;URL=\"#{destination_url}\"'></head><body>You are being redirected to <a href='#{destination_url}'>#{destination_url}</a>.</body></html>"
  save_page(body, url)
end
#process_success(response, parsed_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 175

def process_success(response, parsed_uri)
  url = parsed_uri.to_s
  case response['content-type']
  when /css/
    save_page(response, parsed_uri)
    add_urls(extract_css_urls(response.body, url), {:type_hint => "css_url"})
  when /html/
    save_page(response, parsed_uri)
    doc = Nokogiri::HTML(response.body)
    add_urls(extract_links(doc, url), {:type_hint => "link"})
    add_urls(extract_scripts(doc, url), {:type_hint => "script"})
    add_urls(extract_images(doc, url), {:type_hint => "image"})
    add_urls(extract_hrefs(doc, url), {:type_hint => "href"}) unless @opts[:single_page]
  else
    save_page(response, parsed_uri)
  end
end
#process_url(url, info) ⇒ Object
Fetch a URI and save it to disk.
# File 'lib/staticizer/crawler.rb', line 208

def process_url(url, info)
  @http_connections ||= {}
  parsed_uri = URI(url)

  @log.debug "Fetching #{parsed_uri}"

  # Attempt to use an already open Net::HTTP connection
  key = parsed_uri.host + parsed_uri.port.to_s
  connection = @http_connections[key]
  if connection.nil?
    connection = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
    @http_connections[key] = connection
  end

  request = Net::HTTP::Get.new(parsed_uri.request_uri)
  connection.request(request) do |response|
    case response
    when Net::HTTPSuccess
      process_success(response, parsed_uri)
    when Net::HTTPRedirection
      redirect_url = response['location']
      @log.debug "Processing redirect to #{redirect_url}"
      process_redirect(parsed_uri, redirect_url)
      add_url(redirect_url)
    else
      @log.error "Error #{response.code}:#{response.message} fetching url #{url}"
    end
  end
end
#save_page(response, uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 105

def save_page(response, uri)
  if @opts[:aws]
    save_page_to_aws(response, uri)
  else
    save_page_to_disk(response, uri)
  end
end
#save_page_to_aws(response, uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 157

def save_page_to_aws(response, uri)
  key = uri.path
  key += "?#{uri.query}" if uri.query
  key = key.gsub(%r{^/}, "")
  key = "index.html" if key == ""
  # Upload this file directly to AWS::S3
  opts = {:acl => :public_read}
  opts[:content_type] = response['content-type'] rescue "text/html"
  @log.info "Uploading #{key} to s3 with content type #{opts[:content_type]}"
  if response.respond_to?(:read_body)
    body = process_body(response.read_body, uri, opts)
    @s3_bucket.objects[key].write(body, opts)
  else
    body = process_body(response, uri, opts)
    @s3_bucket.objects[key].write(body, opts)
  end
end
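The S3 key mirrors the URL path (query string included), with the bare root mapped to index.html. The bucket itself comes from the :aws constructor option, whose remaining keys are handed to AWS.config in the aws-sdk v1 style that the AWS::S3 calls above imply. A configuration sketch with placeholder credentials:

Staticizer::Crawler.new("http://example.com",
  :aws => {
    :bucket_name => "example-static-site",  # deleted from the hash before AWS.config
    :access_key_id => "AKIA...",            # placeholder
    :secret_access_key => "..."             # placeholder
  }).crawl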
#save_page_to_disk(response, uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 113

def save_page_to_disk(response, uri)
  path = uri.path
  path += "?#{uri.query}" if uri.query

  path_segments = path.scan(%r{[^/]*/})
  filename = path.include?("/") ? path[path.rindex("/")+1..-1] : path

  current = @output_dir
  FileUtils.mkdir_p(current) unless File.exist?(current)

  # Create all the directories necessary for this file
  path_segments.each do |segment|
    current = File.join(current, "#{segment}").sub(%r{/$}, '')
    if File.file?(current)
      # If we are trying to create a directory and there already is a file
      # with the same name add a .d to the file since we can't create
      # a directory and file with the same name in the file system
      dirfile = current + ".d"
      FileUtils.mv(current, dirfile)
      FileUtils.mkdir(current)
      FileUtils.cp(dirfile, File.join(current, "/index.html"))
    elsif !File.exists?(current)
      FileUtils.mkdir(current)
    end
  end

  body = response.respond_to?(:read_body) ? response.read_body : response
  body = process_body(body, uri, {})

  outfile = File.join(current, "/#{filename}")
  if filename == ""
    indexfile = File.join(outfile, "/index.html")
    @log.info "Saving #{indexfile}"
    File.open(indexfile, "wb") {|f| f << body }
  elsif File.directory?(outfile)
    dirfile = outfile + ".d"
    @log.info "Saving #{dirfile}"
    File.open(dirfile, "wb") {|f| f << body }
    FileUtils.cp(dirfile, File.join(outfile, "/index.html"))
  else
    @log.info "Saving #{outfile}"
    File.open(outfile, "wb") {|f| f << body }
  end
end
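The net effect is a URL-to-filesystem mapping in which file/directory collisions are resolved with a ".d" copy. A sketch, assuming output_dir is /tmp/crawl and /about is fetched before /about/team:

# http://example.com/           -> /tmp/crawl/index.html
# http://example.com/about      -> /tmp/crawl/about            (saved as a plain file)
# http://example.com/about/team -> /tmp/crawl/about.d          (the old file, moved aside)
#                                  /tmp/crawl/about/index.html (copy of about.d)
#                                  /tmp/crawl/about/team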