Class: WebPageArchiver::DataUriHtmlGenerator

Inherits:
Object
  • Object
show all
Includes:
GeneratorHelpers
Defined in:
lib/web_page_archiver.rb

Overview

Generates self-contained HTML in which external resources are embedded as data URIs.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from GeneratorHelpers

#content_type, #download_finished?, #initialize, #join_uri, #start_download_thread

Instance Attribute Details

#confObject

Returns the value of attribute conf.



202
203
204
# File 'lib/web_page_archiver.rb', line 202

# Reader for the +conf+ attribute.
#
# @return [Object] the current value of the @conf instance variable
def conf
  @conf
end

Class Method Details

.generate(filename_or_uri) ⇒ String

Generates a self-contained, data-URI-based HTML document without having to instantiate a DataUriHtmlGenerator object yourself.

html = WebPageArchiver::DataUriHtmlGenerator.generate("https://rubygems.org/")
open("output.html", "w+") { |f| f.write html }

Parameters:

  • filename_or_uri (String, URI)

    file name or URI of the HTML page to convert

Returns:

  • (String)

    text blob containing the result



211
212
213
214
# File 'lib/web_page_archiver.rb', line 211

# One-shot convenience wrapper: builds a fresh generator and immediately
# runs #convert on it, so callers do not have to create an instance.
#
# @param filename_or_uri [String, URI] handed unchanged to #convert
# @return [String] whatever #convert returns for that input
def DataUriHtmlGenerator.generate(filename_or_uri)
  DataUriHtmlGenerator.new.convert(filename_or_uri)
end

Instance Method Details

#convert(filename_or_uri) ⇒ String

convert object at uri to self-contained text-file

Parameters:

  • filename_or_uri (String, URI)

    file name or URI of the HTML page to convert

Returns:

  • (String)

    text blob containing the result



220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# File 'lib/web_page_archiver.rb', line 220

# Parses the HTML document at +filename_or_uri+ and returns it as one string
# in which images, stylesheets and external scripts are inlined.
#
# Every referenced resource is registered in @contents under the MD5 hex
# digest of its resolved URI, and the referencing attribute is temporarily
# pointed at a "cid:<digest>" placeholder. #set_contents is then called before
# the document is serialized.
#
# @param filename_or_uri [String, URI] location of the HTML document; also
#   used as the base when resolving resource references through #join_uri
# @return [String] the serialized document
def convert(filename_or_uri)
  # Block form so the opened stream is closed as soon as parsing is done
  # instead of being left for the garbage collector.
  # NOTE(review): Kernel#open only fetches URLs when open-uri patches it
  # (Ruby < 3.0) -- confirm which Ruby versions this file must support.
  @parser = open(filename_or_uri) { |io| Nokogiri::HTML(io) }

  # [CSS selector, attribute holding the resource reference], in the order
  # the resources are processed.
  resource_attributes = [
    ['img', 'src'],
    ['link[rel=stylesheet]', 'href'],
    ['script', 'src']
  ]

  resource_attributes.each do |selector, attribute|
    @parser.search(selector).each do |node|
      reference = node.attr(attribute)
      # Inline scripts and tags lacking the attribute have nothing to fetch.
      next unless reference

      # Always keep the resolved URI as a String: Digest::MD5.hexdigest
      # rejects non-String arguments, and all three resource kinds should
      # store the same type in @contents.
      uri = join_uri(filename_or_uri, reference).to_s
      uid = Digest::MD5.hexdigest(uri)
      # NOTE(review): tags resolving to the same URI share one uid, so a later
      # tag replaces the earlier entry here -- confirm whether the earlier tag
      # is meant to keep its "cid:" placeholder.
      @contents[uid] = { :uri => uri, :parser_ref => node, :attribute_name => attribute }
      node.set_attribute(attribute, "cid:#{uid}")
    end
  end

  set_contents
  @parser.to_s
end

#set_contentsObject

replaces content-placeholders with actual content



251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
# File 'lib/web_page_archiver.rb', line 251

# Queues every registered content id, starts the download threads, waits for
# them, and then points each registered tag attribute at an inline
# "data:<content_type>;base64,<payload>" URI built from the entry's :body.
#
# Presumably :body and :content_type are filled in by the download threads --
# verify against #start_download_thread.
#
# @return [Hash] the @contents hash
def set_contents
  # hand every content id to the work queue
  @contents.each_key { |uid| @queue.push(uid) }
  # kick off the downloads, then block until all threads have finished
  start_download_thread
  @threads.each(&:join)
  @contents.each do |_uid, entry|
    # encode64 wraps its output in lines; the data URI needs it unbroken
    payload = Base64.encode64(entry[:body]).delete("\n")
    data_uri = "data:#{entry[:content_type]};base64,#{payload}"
    entry[:parser_ref].set_attribute(entry[:attribute_name], data_uri)
  end
end