Class: WebPageArchiver::MhtmlGenerator
- Inherits:
-
Object
- Object
- WebPageArchiver::MhtmlGenerator
- Includes:
- GeneratorHelpers
- Defined in:
- lib/web_page_archiver.rb
Overview
Generates MHTML (.mht) files.
Instance Attribute Summary collapse
-
#conf ⇒ Object
Returns the value of attribute conf.
Class Method Summary collapse
-
.generate(filename_or_uri) ⇒ String
generate mhtml (mht) file without instantiating a MhtmlGenerator object.
Instance Method Summary collapse
-
#add_html_content(cid) ⇒ Object
helper method to generate proper mime part headers.
-
#attach_contents ⇒ Object
adds mime-parts.
-
#convert(filename_or_uri) ⇒ String
convert object at uri to self-contained text-file.
Methods included from GeneratorHelpers
#content_type, #download_finished?, #initialize, #join_uri, #start_download_thread
Instance Attribute Details
#conf ⇒ Object
Returns the value of attribute conf.
95 96 97 |
# File 'lib/web_page_archiver.rb', line 95
# Reader for the generator's configuration object.
#
# @return [Object] whatever is currently stored in @conf
def conf
  @conf
end
Class Method Details
.generate(filename_or_uri) ⇒ String
generate mhtml (mht) file without instantiating a MhtmlGenerator object
mhtml = WebPageArchiver::MhtmlGenerator.generate("https://rubygems.org/")
open("output.mht", "w+") { |f| f.write mhtml }
104 105 106 107 |
# File 'lib/web_page_archiver.rb', line 104
# Convenience entry point: builds a fresh MhtmlGenerator and immediately
# delegates to #convert, so callers do not have to instantiate one themselves.
#
# @param filename_or_uri [String] passed through unchanged to #convert
# @return [String] whatever #convert returns for that input
def MhtmlGenerator.generate(filename_or_uri)
  MhtmlGenerator.new.convert(filename_or_uri)
end
Instance Method Details
#add_html_content(cid) ⇒ Object
helper method to generate proper mime part headers
@param cid [String] content ID of the resource to emit. @return [nil] the MIME part text is appended to @src rather than returned.
183 184 185 186 187 188 189 190 191 192 193 194 195 |
# File 'lib/web_page_archiver.rb', line 183
# Appends one MIME part (boundary line, part headers, Base64 body) for the
# resource stored under +cid+ in @contents to the @src buffer.
#
# @param cid [String] key into @contents; the entry is read for :uri,
#   :content_type and :body
# @return [nil] the part is written to @src, nothing is returned
def add_html_content(cid)
  part = @contents[cid]
  # +path+ may be nil for URIs without a hierarchical path; to_s keeps
  # File.basename from raising on such an entry.
  filename = File.basename(URI(part[:uri]).path.to_s)
  # Entries are created with only :uri, so :body is nil when a download never
  # filled it in. Emit an empty body for that part instead of letting
  # Base64.encode64(nil) abort the whole archive.
  encoded_body = Base64.encode64(part[:body].to_s)

  @src.puts "--#{@boundary}"
  @src.puts "Content-Disposition: inline; filename=#{filename}"
  @src.puts "Content-Type: #{part[:content_type]}"
  @src.puts "Content-Location: #{part[:uri]}"
  @src.puts "Content-Transfer-Encoding: Base64"
  @src.puts "Content-Id: #{cid}"
  @src.puts ""
  @src.puts encoded_body
  @src.puts ""
  nil
end
#attach_contents ⇒ Object
adds mime-parts
169 170 171 172 173 174 175 176 177 |
# File 'lib/web_page_archiver.rb', line 169
# Queues every collected resource, runs the download threads to completion
# and then appends one MIME part per resource.
#
# @return [Hash] the @contents hash (result of the final iteration)
def attach_contents
  # prepare queue: one entry per content id
  @contents.each_key { |cid| @queue.push(cid) }
  # start download threads
  start_download_thread
  # wait until every download thread has finished
  @threads.each(&:join)
  @contents.each_key { |cid| add_html_content(cid) }
end
#convert(filename_or_uri) ⇒ String
convert object at uri to self-contained text-file
113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
# File 'lib/web_page_archiver.rb', line 113
# Converts the document at +filename_or_uri+ into a single multipart/related
# (MHTML) text document.
#
# Writes the message headers and the HTML part to @src, registers every
# img / stylesheet link / script reference in @contents (keyed by the MD5 of
# its resolved URI), lets #attach_contents append the resource parts, and
# finally returns everything accumulated in @src.
#
# The HTML part carries the text exactly as it was read; the parsed tree,
# whose references are rewritten to "cid:" form, is kept in @parser.
#
# @param filename_or_uri [String] location of the HTML document to archive
# @return [String] the full contents of @src after the closing boundary
def convert(filename_or_uri)
  # NOTE(review): Kernel#open is kept from the original. It treats a leading
  # "|" as a command to spawn, and on Ruby >= 3.0 it no longer opens URLs via
  # open-uri (URI.open does) -- confirm the supported Ruby versions and
  # whether the input can be untrusted.
  f = open(filename_or_uri)
  html = f.read
  @parser = Nokogiri::HTML(html)

  @src.puts "Subject: #{@parser.search('title').text}"
  @src.puts "Content-Type: multipart/related; boundary=#{@boundary}"
  @src.puts "Content-Location: #{filename_or_uri}"
  @src.puts "Date: #{Time.now}"
  @src.puts "MIME-Version: 1.0"
  @src.puts ""
  @src.puts "mime mhtml content"
  @src.puts ""

  # imgs, styles, scripts -- handled uniformly, in that order
  [['img', 'src'], ['link[rel=stylesheet]', 'href'], ['script', 'src']].each do |selector, attribute|
    @parser.search(selector).each do |node|
      reference = node.attr(attribute)
      # inline scripts (and any element lacking the attribute) have nothing to fetch
      next unless reference

      # to_s so the digest input and the stored :uri are Strings for every
      # element kind, not just for images
      uri = join_uri(filename_or_uri, reference).to_s
      uid = Digest::MD5.hexdigest(uri)
      @contents[uid] = { :uri => uri }
      node.set_attribute(attribute, "cid:#{uid}")
    end
  end

  # Array#find without a block returns an (always truthy) Enumerator, so the
  # membership test has to be include?.
  html_unencoded = @conf[:base64_except].include?("html")

  @src.puts "--#{@boundary}"
  @src.puts "Content-Disposition: inline; filename=default.htm"
  @src.puts "Content-Type: #{content_type(f)}"
  @src.puts "Content-Id: #{Digest::MD5.hexdigest(filename_or_uri)}"
  @src.puts "Content-Location: #{filename_or_uri}"
  if html_unencoded
    @src.puts "Content-Transfer-Encoding: 8bit"
    @src.puts ""
    @src.puts html
  else
    # keep the body consistent with the declared transfer encoding
    @src.puts "Content-Transfer-Encoding: Base64"
    @src.puts ""
    @src.puts Base64.encode64(html)
  end
  @src.puts ""

  attach_contents
  @src.puts "--#{@boundary}--"
  @src.rewind
  @src.read
ensure
  # release the source handle even when parsing or downloading fails
  f.close if f && !f.closed?
end