Class: Stevedore::ArchiveSplitter
- Inherits: Object
- Inheritance chain: Object → Stevedore::ArchiveSplitter
- Defined in:
- lib/split_archive.rb
Constant Summary
- HANDLED_FORMATS =
["zip", "mbox", "pst", "eml"]
Class Method Summary
- .get_attachments_from_eml(email_filename) ⇒ Object
- .split(archive_filename) ⇒ Object
- .split_mbox(archive_filename) ⇒ Object
- .split_pst(archive_filename) ⇒ Object
- .split_zip(archive_filename) ⇒ Object
Class Method Details
.get_attachments_from_eml(email_filename) ⇒ Object
112 113 114 115 116 117 118 119 120 121 122 |
# File 'lib/split_archive.rb', line 112

# Lazily enumerates an .eml message followed by each of its attachments.
#
# Every yielded element is a four-item array:
#   [name, writer, attachment_basenames, parent_basename]
# * name                 - the message's basename, or the attachment's filename
# * writer               - lambda taking a destination path and writing the
#                          item's bytes to it
# * attachment_basenames - for the message: basenames of its attachments;
#                          for an attachment: an empty array
# * parent_basename      - nil for the message; the message's basename for
#                          an attachment
#
# NOTE(review): the extracted listing had lost every identifier containing
# "attachment"; `mail.attachments` and the local names below are restored on
# the assumption that the mail gem's message API is in use -- confirm against
# lib/split_archive.rb.
#
# @param email_filename [String] path to the .eml file
# @return [Enumerator] the file is not read until the enumerator is iterated
def self.get_attachments_from_eml(email_filename)
  Enumerator.new do |yielder|
    email_basename = File.basename(email_filename)
    # File.read instead of Kernel#open, which treats a leading "|" in the
    # name as a command to run.
    mail = Mail.new(File.read(email_filename))

    attachment_entries = mail.attachments.map do |attachment|
      writer = lambda { |fn| File.open(fn, 'wb') { |fh| fh << attachment.body.decoded } }
      [attachment.filename, writer, [], email_basename]
    end
    attachment_basenames = attachment_entries.map { |entry| File.basename(entry[0]) }

    # The message is copied byte-for-byte, and only when the writer is called.
    copy_email = lambda { |fn| File.open(fn, 'wb') { |fh| fh << File.binread(email_filename) } }

    yielder << [email_basename, copy_email, attachment_basenames, nil]
    attachment_entries.each { |entry| yielder << entry }
  end
end
.split(archive_filename) ⇒ Object
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/split_archive.rb', line 20

# Splits an archive into its constituent files, dispatching on the file
# extension (mbox, pst, zip or eml) to the matching splitter.
#
# Each constituent is written into a temporary directory and then yielded as
# four values:
#   tmp_filename  - where the constituent was written
#   relative_name - "<archive basename>/<constituent name>"
#   attachments   - names of the constituent's attachments ([] when the
#                   splitter supplied none)
#   parent        - name of the containing message, or nil
#
# The temporary directory only exists while the enumerator is being
# iterated (it is created by the block form of Dir.mktmpdir), so consumers
# must use each file before asking for the next one.
#
# NOTE(review): identifiers containing "attachment" were missing from the
# extracted listing; `get_attachments_from_eml` and the local `attachments`
# are restored from context -- confirm against lib/split_archive.rb.
#
# @param archive_filename [String] path to the archive
# @return [Enumerator]
# @raise [ArgumentError] (during iteration) when the extension is not handled
def self.split(archive_filename)
  Enumerator.new do |yielder|
    Dir.mktmpdir do |tmpdir|
      # TODO: should probably do magic byte searching etc.
      # Downcased so that "FOO.ZIP" is handled like "foo.zip".
      extension = archive_filename.split(".").last.to_s.downcase
      puts "splitting #{archive_filename}"

      # Every splitter yields arrays of
      #   [relative filename, lambda writing the contents to a given path,
      #    (optional) attachment names, (optional) parent name]
      constituent_files =
        case extension
        when "mbox" then split_mbox(archive_filename)
        when "pst"  then split_pst(archive_filename)
        when "zip"  then split_zip(archive_filename)
        when "eml"  then get_attachments_from_eml(archive_filename)
        else
          # Previously this fell through to a NoMethodError on nil.
          raise ArgumentError, "don't know how to split #{archive_filename} (extension #{extension.inspect})"
        end

      archive_basename = File.basename(archive_filename)
      extraction_root = File.join(tmpdir, archive_basename)
      FileUtils.mkdir_p(extraction_root)
      root_prefix = File.expand_path(extraction_root) + File::SEPARATOR

      constituent_files.each do |basename, contents_lambda, attachments, parent|
        tmp_filename = File.join(extraction_root, basename.to_s)

        # Entry names come from the archive itself; refuse anything (e.g.
        # "../x", or a missing name) that would not land inside our
        # extraction directory.
        unless File.expand_path(tmp_filename).start_with?(root_prefix)
          puts "skipping #{basename.inspect} from #{archive_filename}: it would be written outside the extraction directory"
          next
        end

        FileUtils.mkdir_p(File.dirname(tmp_filename))
        begin
          contents_lambda.call(tmp_filename)
        rescue Errno::ENOENT
          puts "#{tmp_filename} wasn't extracted from #{archive_filename}"
          next
        end

        yielder.yield tmp_filename, File.join(archive_basename, basename), attachments || [], parent
      end
    end
  end
end
.split_mbox(archive_filename) ⇒ Object
124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
# File 'lib/split_archive.rb', line 124

# Lazily splits an mbox file into its individual messages and their
# attachments.
#
# For every message, yields ["<index>.eml", writer] followed by one
# [attachment filename, writer] pair per attachment; each writer is a lambda
# that takes a destination path and writes the bytes there.
#
# NOTE(review): identifiers containing "attachment" were missing from the
# extracted listing; `mail.attachments` and `attachment` are restored on the
# assumption that the mail gem's message API is in use -- confirm against
# lib/split_archive.rb.
#
# @param archive_filename [String] path to the mbox file
# @return [Enumerator] the file is only opened once iteration starts
def self.split_mbox(archive_filename)
  # stolen shamelessly from the Ruby Enumerable docs, actually
  # split mails in mbox (slice before Unix From line after an empty line)
  Enumerator.new do |yielder|
    File.open(archive_filename) do |mbox|
      # The "was the previous line empty?" state lives in a local variable:
      # slice_before no longer accepts an initial-state argument.
      previous_was_empty = true
      messages = mbox.slice_before do |line|
        starts_message = previous_was_empty && line.start_with?("From ")
        previous_was_empty = line == "\n" || line == "\r\n" || line == "\r"
        starts_message
      end

      messages.each_with_index do |mail_lines, idx|
        # TODO: copy over stuff from get_attachments_from_eml for attachment/parents
        mail_lines.pop if mail_lines.last == "\n" # remove the trailing separator line if present
        raw_mail = mail_lines.join

        yielder << ["#{idx}.eml", lambda { |fn| File.open(fn, 'wb') { |out| out << raw_mail } }]

        Mail.new(raw_mail).attachments.each do |attachment|
          yielder << [attachment.filename, lambda { |fn| File.open(fn, 'wb') { |out| out << attachment.body.decoded } }]
        end
      end
    end
  end
end
.split_pst(archive_filename) ⇒ Object
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
# File 'lib/split_archive.rb', line 59 def self.split_pst(archive_filename) pstfile = Java::ComPFF::PSTFile.new(archive_filename) idx = 0 folders = pstfile.root.sub_folders.inject({}) do |memo,f| memo[f.name] = f memo end Enumerator.new do |yielder| folders.each do |folder_name, folder| while mail = folder.getNextChild eml_str = mail. + mail.get_body yielder << ["#{idx}.eml", lambda{|fn| open(fn, 'wb'){|fh| fh << eml_str } }] = mail. .times do || = mail.() = .get_filename yielder << ["#{idx}-#{}", lambda {|fn| open(fn, 'wb'){ |fh| fh << .get_file_input_stream.to_io.read }}] end idx += 1 end end end end |
.split_zip(archive_filename) ⇒ Object
146 147 148 149 150 151 152 153 154 155 156 157 158 |
# File 'lib/split_archive.rb', line 146

# Lazily enumerates the entries of a zip archive.
#
# Yields [entry name, writer] for every entry, where writer is a lambda that
# takes a destination path and extracts the entry to it.
#
# The rescue wraps the yield itself, so when the enumerator is consumed with
# a block, any StandardError raised while that block handles an entry
# (including a failing writer called from the block) is reported with
# "unable to extract ..." and iteration moves on to the next entry.
#
# NOTE(review): the enumerator is built inside the Zip::File.open block and
# returned from it, so it is iterated after that block has finished --
# confirm the zip library still permits reading entries at that point.
#
# @param archive_filename [String] path to the .zip file
# @return [Enumerator] (the value of the Zip::File.open block)
def self.split_zip(archive_filename)
  Zip::File.open(archive_filename) do |zip_file|
    Enumerator.new do |yielder|
      zip_file.each do |entry|
        begin
          yielder << [entry.name, lambda { |fn| entry.extract(fn) }]
        rescue StandardError
          puts "unable to extract #{entry.name} from #{archive_filename}"
        end
      end
    end
  end
end