Class: WebPageArchiver::InlineHtmlGenerator
- Inherits:
-
Object
- Object
- WebPageArchiver::InlineHtmlGenerator
- Includes:
- GeneratorHelpers
- Defined in:
- lib/web_page_archiver.rb
Overview
Generates self-contained HTML in which every external resource is inlined.
Instance Attribute Summary collapse
-
#conf ⇒ Object
Returns the value of attribute conf.
Class Method Summary collapse
-
.generate(filename_or_uri) ⇒ String
Generates a self-contained HTML file, with all resources inlined, without instantiating an InlineHtmlGenerator object yourself.
Instance Method Summary collapse
-
#convert(filename_or_uri) ⇒ String
Converts the document at the given filename or URI into a self-contained HTML string.
- #set_contents ⇒ Object
Methods included from GeneratorHelpers
#content_type, #download_finished?, #initialize, #join_uri, #start_download_thread
Instance Attribute Details
#conf ⇒ Object
Returns the value of attribute conf.
272 273 274 |
# File 'lib/web_page_archiver.rb', line 272
# Returns the value of attribute conf.
#
# @return [Object] whatever is currently stored in @conf (nil when unset)
def conf
  @conf
end
Class Method Details
.generate(filename_or_uri) ⇒ String
Generates a self-contained HTML file, with all resources inlined, without instantiating an InlineHtmlGenerator object yourself.
html = WebPageArchiver::InlineHtmlGenerator.generate("https://rubygems.org/")
open("output.html", "w+") { |f| f.write html }
281 282 283 284 |
# File 'lib/web_page_archiver.rb', line 281
# Convenience entry point: builds a generator and converts the given document
# in one call, so callers do not have to instantiate the class themselves.
#
# @param filename_or_uri [String] location of the HTML document to convert
# @return [String] the result of #convert for that document
def self.generate(filename_or_uri)
  # `new` (rather than naming the class) keeps this working for subclasses.
  new.convert(filename_or_uri)
end
Instance Method Details
#convert(filename_or_uri) ⇒ String
Converts the document at the given filename or URI into a self-contained HTML string.
290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 |
# File 'lib/web_page_archiver.rb', line 290
# Converts the HTML document at +filename_or_uri+ into one self-contained
# HTML string.
#
# The document is parsed into @parser; every image, stylesheet link and
# external script is registered in @contents and temporarily re-pointed at a
# "cid:<md5>" placeholder; #set_contents then replaces the placeholders with
# the inlined resources.
#
# @param filename_or_uri [String] location of the HTML document to convert
# @return [String] the serialized document after inlining
def convert(filename_or_uri)
  # Block form so the underlying handle is closed as soon as parsing is done.
  # NOTE(review): Kernel#open only fetches URLs when open-uri has patched it
  # (presumably required elsewhere in this file) and will run a command for
  # input starting with "|" - do not pass untrusted strings here.
  @parser = open(filename_or_uri) { |io| Nokogiri::HTML(io) }

  register_inline_resources(filename_or_uri, 'img[src]', 'src')
  register_inline_resources(filename_or_uri, 'link[rel=stylesheet][href]', 'href')
  register_inline_resources(filename_or_uri, 'script[src]', 'src')

  set_contents
  @parser.to_s
end

# Registers every node matching +selector+ as a resource to download and
# swaps its +attribute+ for a "cid:<md5 of absolute uri>" placeholder.
#
# The selectors used by #convert require the attribute to be present, so
# nodes without it (e.g. inline <script> blocks) are never passed to join_uri.
#
# NOTE(review): two nodes that resolve to the same URI share one uid, so only
# the last node is kept in @contents; earlier ones keep their placeholder.
def register_inline_resources(base, selector, attribute)
  @parser.search(selector).each do |node|
    # Always stringify: Digest::MD5.hexdigest needs a String, and the image
    # branch already stored the URI as a String.
    uri = join_uri(base, node.attr(attribute)).to_s
    uid = Digest::MD5.hexdigest(uri)
    @contents[uid] = {:uri=>uri, :parser_ref=>node, :attribute_name=>attribute}
    node.set_attribute(attribute, "cid:#{uid}")
  end
end
private :register_inline_resources
#set_contents ⇒ Object
320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 |
# File 'lib/web_page_archiver.rb', line 320
# Downloads every resource registered in @contents and inlines it into the
# parsed document.
#
# Every uid in @contents is pushed onto @queue, the download threads are
# started through start_download_thread, and all of @threads are joined
# before the document is modified. Each entry is then applied to its
# :parser_ref node:
#
# * <script> - the body becomes the tag's content and the source attribute
#   is removed;
# * <link> whose content type is CSS - a <style> element holding the body is
#   inserted after the link and the link itself is removed;
# * anything else - the attribute is rewritten to a base64 "data:" URI.
#
# The :body and :content_type keys are read here but not set in this method;
# presumably the download threads fill them in.
#
# @return [Hash] @contents
def set_contents
  # Prepare the queue, start the download threads, wait until they finish.
  @contents.each_key { |uid| @queue.push(uid) }
  start_download_thread
  @threads.each(&:join)

  @contents.each do |_uid, entry|
    tag = entry[:parser_ref]
    body = entry[:body]
    attribute = entry[:attribute_name]

    if body.nil?
      # NOTE(review): assumes a missing :body means the download failed.
      # Point the tag back at the original URI instead of crashing in Base64
      # or leaving a dangling "cid:" placeholder behind.
      tag.set_attribute(attribute, entry[:uri].to_s)
      next
    end

    if tag.name == 'script'
      tag.content = body
      tag.remove_attribute(attribute)
    elsif tag.name == 'link' && entry[:content_type].to_s.start_with?('text/css')
      # Compare (the old code assigned "text/css" here, so the test was always
      # true and the stored content type was overwritten). A prefix match
      # tolerates parameters such as "text/css; charset=utf-8".
      tag.after("<style type=\"text/css\">#{body}</style>")
      tag.remove
    else
      # Back to an inline data URI for everything that is not script or CSS.
      # strict_encode64 emits no line breaks, so no newline stripping needed.
      encoded = Base64.strict_encode64(body)
      tag.set_attribute(attribute, "data:#{entry[:content_type]};base64,#{encoded}")
    end
  end
end