Class: WebPageArchiver::InlineHtmlGenerator

Inherits:
Object
  • Object
show all
Includes:
GeneratorHelpers
Defined in:
lib/web_page_archiver.rb

Overview

Generates a self-contained HTML document in which all referenced resources (images, stylesheets, scripts) are inlined.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from GeneratorHelpers

#content_type, #download_finished?, #initialize, #join_uri, #start_download_thread

Instance Attribute Details

#confObject

Returns the value of attribute conf.



272
273
274
# File 'lib/web_page_archiver.rb', line 272

# Reader for the +conf+ attribute.
#
# @return [Object] the value currently stored in @conf
def conf
  @conf
end

Class Method Details

.generate(filename_or_uri) ⇒ String

Generates a self-contained, all-inline HTML file without having to instantiate an InlineHtmlGenerator object first.

html = WebPageArchiver::InlineHtmlGenerator.generate("https://rubygems.org/")
open("output.html", "w+") { |f| f.write html }

Parameters:

  • filename_or_uri (String, URI)

    local file name or URI of the page to convert

Returns:

  • (String)

    text blob containing the result



281
282
283
284
# File 'lib/web_page_archiver.rb', line 281

# One-shot convenience wrapper: creates a throwaway InlineHtmlGenerator and
# runs #convert on it, so callers do not have to instantiate one themselves.
#
# @param filename_or_uri [String, URI] passed unchanged to #convert
# @return [String] whatever #convert returns for that input
def InlineHtmlGenerator.generate(filename_or_uri)
  InlineHtmlGenerator.new.convert(filename_or_uri)
end

Instance Method Details

#convert(filename_or_uri) ⇒ String

convert object at uri to self-contained text-file

Parameters:

  • filename_or_uri (String, URI)

    local file name or URI of the page to convert

Returns:

  • (String)

    text blob containing the result



290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
# File 'lib/web_page_archiver.rb', line 290

# Convert the page at +filename_or_uri+ into one self-contained HTML string.
#
# The page is parsed with Nokogiri, every <img>, stylesheet <link> and
# external <script> is registered in @contents under the MD5 hex digest of
# its resolved URI, and the tag's attribute is temporarily pointed at
# "cid:<digest>". #set_contents is then called before the document is
# serialized.
#
# Assumes @contents has already been initialised to a Hash (not done in this
# method — presumably in the included initializer; confirm).
#
# @param filename_or_uri [String, URI] local file name or URI of the page
# @return [String] the serialized document
def convert(filename_or_uri)
  # Block form so the file/connection handle is closed once parsing is done.
  @parser = open(filename_or_uri) { |io| Nokogiri::HTML(io) }

  # [CSS selector, attribute carrying the resource reference], in the order
  # the resources are registered: images, stylesheets, scripts.
  resource_kinds = [['img', 'src'], ['link[rel=stylesheet]', 'href'], ['script', 'src']]

  resource_kinds.each do |selector, attribute|
    @parser.search(selector).each do |node|
      reference = node.attr(attribute)
      # Inline scripts, and any tag lacking the attribute, have nothing to fetch.
      next unless reference

      # to_s keeps the digest input (and the stored :uri) a String regardless
      # of what join_uri returns; the image path already did this.
      uri = join_uri(filename_or_uri, reference).to_s
      uid = Digest::MD5.hexdigest(uri)
      # NOTE(review): two tags that resolve to the same URI share one uid, so
      # the later tag replaces the earlier entry and the earlier tag keeps its
      # "cid:" placeholder. Fixing that needs a change to the @contents layout.
      @contents[uid] = {:uri => uri, :parser_ref => node, :attribute_name => attribute}
      node.set_attribute(attribute, "cid:#{uid}")
    end
  end

  set_contents
  @parser.to_s
end

#set_contentsObject



320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
# File 'lib/web_page_archiver.rb', line 320

# Download every resource registered in @contents and embed it in the document.
#
# Each key of @contents is pushed onto @queue, the download threads are
# started and joined, and every registered tag is then rewritten:
# * script tags receive the body as their content and lose the attribute
#   that referenced the external file;
# * link tags whose content type is text/css are replaced by a <style>
#   element holding the body;
# * every other tag gets a base64 "data:" URI in its original attribute.
#
# Assumes the download threads have stored :body and :content_type on each
# entry — that code is not part of this method; confirm against
# start_download_thread.
#
# @return [Hash] the @contents hash
def set_contents
  # Queue every registered resource, fetch them all, then wait for completion.
  @contents.each_key { |key| @queue.push(key) }
  start_download_thread
  @threads.each(&:join)

  @contents.each do |_key, entry|
    tag = entry[:parser_ref]
    if tag.name == "script"
      tag.content = entry[:body]
      tag.remove_attribute(entry[:attribute_name])
    elsif tag.name == "link" && entry[:content_type].to_s.start_with?("text/css")
      # This must be a comparison: the previous assignment made every link tag
      # take this branch and overwrote the stored content type. start_with?
      # also accepts a parameterised type such as "text/css; charset=utf-8".
      tag.after("<style type=\"text/css\">#{entry[:body]}</style>")
      tag.remove
    else
      # Anything else stays in place and is embedded as a data: URI;
      # strict_encode64 emits no line breaks, which a URI must not contain.
      encoded = Base64.strict_encode64(entry[:body])
      tag.set_attribute(entry[:attribute_name], "data:#{entry[:content_type]};base64,#{encoded}")
    end
  end
end