Class: EncodedWord

Inherits:

Object

Object
EncodedWord

show all

Defined in: lib/encoded_word.rb

Instance Method Summary collapse

Constructor Details

#initialize(inputdir) ⇒ `EncodedWord`

Returns a new instance of EncodedWord.

# File 'lib/encoded_word.rb', line 7

# Stores the directory that the other methods read from and write to.
#
# When the platform defines File::ALT_SEPARATOR, every occurrence of it in
# +inputdir+ is replaced with File::SEPARATOR before the path is kept in
# @inputdir; otherwise the path is kept unchanged.
#
# @param inputdir [String] directory path
def initialize(inputdir)
    alt = File::ALT_SEPARATOR
    @inputdir = alt ? inputdir.gsub(alt, File::SEPARATOR) : inputdir
end

Instance Method Details

#combine ⇒ `Object`



20
21
22

# File 'lib/encoded_word.rb', line 20

# Rebuilds the combined CSV only: delegates straight to
# #combine_all_mlog_plain without running the decoding step.
def combine
    combine_all_mlog_plain
end

#combine_all_mlog_plain ⇒ `Object`

# File 'lib/encoded_word.rb', line 24

# Merges every `*.plain` file found recursively under @inputdir into one CSV.
#
# The CSV is written as UTF-8 to `all_mlog.csv` directly inside @inputdir and
# begins with a fixed header row. Each line of each source file becomes one
# row: the line is split with #mysplit, fields are picked by position, and
# the to/cc/attach fields pass through #to_csv_column. Every source path is
# echoed to stdout as it is processed.
#
# Every cell is wrapped in double quotes, and double quotes inside a value
# are doubled so a quote in (for example) a subject cannot break the row.
def combine_all_mlog_plain
    quote = lambda { |value| %Q("#{value.to_s.gsub('"', '""')}") }
    File.open(File.join(@inputdir, 'all_mlog.csv'), 'w:utf-8') do |out|
        out.puts '"key","","","date","from","to","cc","bcc","subject","attach"'
        Dir.glob(File.join(@inputdir, '**', '*.plain')).each do |f|
            puts f
            # Block form guarantees the source file is closed after reading.
            File.open(f, 'r:utf-8') do |src|
                src.each_line do |line|
                    parts = mysplit(line)
                    # NOTE(review): key and from both read column 3; confirm
                    # whether key was meant to be another column.
                    key = parts[3]
                    date = parts[0]
                    subject = parts[2]
                    from = parts[3]
                    to = to_csv_column(parts[4])
                    cc = to_csv_column(parts[5])
                    attach = to_csv_column(parts[6])
                    # The two blank cells after key and the blank bcc cell
                    # line up with the header row.
                    row = [key, '', '', date, from, to, cc, '', subject, attach]
                    out.puts row.map(&quote).join(',')
                end
            end
        end
    end
end

#concat_one_line(words) ⇒ `Object`

# File 'lib/encoded_word.rb', line 88

# Glues +words+ together into a single line.
#
# The words are concatenated, the result is split on "\n" with #mysplit, and
# the pieces are concatenated again, which drops the line breaks.
#
# @param words [Array<String>] fragments to concatenate
# @return [String] the fragments as one string without "\n"
def concat_one_line(words)
    mysplit(words.join(''), "\n").join('')
end

#decode ⇒ `Object`

# File 'lib/encoded_word.rb', line 15

# Runs the whole pipeline in order: #decode_all_mlog first, then
# #combine_all_mlog_plain.
def decode
    decode_all_mlog
    combine_all_mlog_plain
end

#decode_all_mlog ⇒ `Object`

# File 'lib/encoded_word.rb', line 49

# Converts every `*.mlog` file found recursively under @inputdir into a
# UTF-8 file next to it named `<file>.plain`.
#
# Each input line is split with #mysplit. A line whose column 1 (message-id)
# was already seen in the same file is skipped. For every other line one
# tab-separated output line is written with these columns: formatted date,
# message-id, decoded subject, from, to, cc (each reduced to addresses by
# #trim_emails) and the attachments built by #decode_attaches.
#
# Writing is best-effort per field: if writing a field raises, the field is
# left empty and the line is still completed. Each source path is echoed to
# stdout.
def decode_all_mlog
    Dir.glob(File.join(@inputdir, '**', '*.mlog')).each do |f|
        puts f
        seen_ids = {}
        File.open(f + '.plain', 'w:utf-8') do |out|
            File.open(f) do |input|
                # #trim_emails reads @input_enc as the source encoding.
                @input_enc = input.external_encoding
                input.each_line do |line|
                    parts = mysplit(line)
                    message_id = parts[1]
                    next if seen_ids.key?(message_id)
                    seen_ids[message_id] = 0

                    fields = [
                        format_date(parts[0]),    # date
                        message_id,               # message-id
                        decode_subject(parts[2]), # subject
                        trim_emails(parts[3]),    # from
                        trim_emails(parts[4]),    # to
                        trim_emails(parts[5]),    # cc
                        decode_attaches(parts)    # attachments
                    ]
                    last_field = fields.pop
                    fields.each do |field|
                        begin
                            out.write field
                            out.write "\t"
                        rescue
                            # Keep the column position even if the value
                            # could not be written.
                            out.write "\t"
                        end
                    end
                    begin
                        out.puts last_field
                    rescue
                        out.puts ''
                    end
                end
            end
        end
    end
end

#decode_attach(attach) ⇒ `Object`

# File 'lib/encoded_word.rb', line 174

# Decodes one attachment field.
#
# The field is split on "\a" with #mysplit, each piece goes through
# #word_decode, and the results are merged by #concat_one_line.
#
# @param attach [String, nil] raw attachment field
# @return [String] decoded text, or '' when +attach+ is nil or empty
def decode_attach(attach)
    return '' unless attach && attach.length > 0
    decoded = mysplit(attach, "\a").map { |piece| word_decode(piece) }
    concat_one_line(decoded)
end

#decode_attaches(parts) ⇒ `Object`

# File 'lib/encoded_word.rb', line 165

# Decodes every column from index 6 onward with #decode_attach and joins
# the results with "\a".
#
# @param parts [Array<String>] all columns of one input line
# @return [String] joined attachment text, or '' when there is no column 6
def decode_attaches(parts)
    extra = parts.length > 6 ? parts[6..-1] : []
    return '' if extra.empty?
    extra.map { |raw| decode_attach(raw) }.join("\a")
end

#decode_subject(sub) ⇒ `Object`

# File 'lib/encoded_word.rb', line 123

# Decodes a subject field.
#
# The field is split on "\a" with #mysplit, each fragment goes through
# #word_decode, and the results are merged by #concat_one_line.
#
# @param sub [String, nil] raw subject field
# @return [String] decoded subject, or '' when +sub+ is nil or empty
def decode_subject(sub)
    if sub && sub.length > 0
        concat_one_line(mysplit(sub, "\a").map { |fragment| word_decode(fragment) })
    else
        ''
    end
end

#format_date(engdate) ⇒ `Object`

# File 'lib/encoded_word.rb', line 116

# Parses a date string and renders it as "YYYY/MM/DD HH:MM:SS" at the
# fixed UTC offset +0900.
#
# @param engdate [String, nil] date text accepted by DateTime.parse
# @return [String] formatted timestamp, or '' when +engdate+ is nil or empty
def format_date(engdate)
    return '' unless engdate && engdate.length > 0
    shifted = DateTime.parse(engdate).new_offset('+0900')
    shifted.strftime("%Y/%m/%d %H:%M:%S")
end

#getmail(line, at) ⇒ `Object`

# File 'lib/encoded_word.rb', line 149

# Extracts the e-mail address that surrounds the "@" at index +at+.
#
# Scans left from the "@" while characters are letters, digits, ".", "_",
# "+" or "-", and right while characters are letters, digits, "." or "-".
# "+" is accepted on the left so plus-addressed mailboxes such as
# "user+tag@example.com" are not cut down to "tag@example.com".
#
# @param line [String] text containing the address
# @param at [Integer] index of the "@" character within +line+
# @return [String] the substring from the first local-part character to the
#   last domain character
def getmail(line, at)
    local_char = /[\._a-zA-Z0-9+-]/
    domain_char = /[\.a-zA-Z0-9-]/

    first = at
    first -= 1 while first > 0 && line[first - 1] =~ local_char

    final = at
    final += 1 while final < line.length - 1 && line[final + 1] =~ domain_char

    line[first..final]
end

#mysplit(line, sep = "\t") ⇒ `Object`

# File 'lib/encoded_word.rb', line 94

# Splits +line+ on +sep+, keeping empty fields.
#
# Unlike a bare String#split, empty fields (including trailing ones) are
# preserved. If +line+ ends with "\n", that newline is left out of the final
# field. The scan advances one character past each separator hit.
#
# @param line [String, nil] text to split
# @param sep [String] separator (the callers in this file pass one character)
# @return [Array<String>] the fields, or [] when +line+ is nil or empty
def mysplit(line, sep = "\t")
    return [] unless line && line.length > 0

    fields = []
    start = 0
    while (hit = line.index(sep, start))
        fields << line[start...hit]
        start = hit + 1
    end
    # Drop a terminating newline from the final field only.
    tail_end = line[-1] == "\n" ? -2 : -1
    fields << line[start..tail_end]
    fields
end

#to_csv_column(str) ⇒ `Object`

# File 'lib/encoded_word.rb', line 44

# Prepares a multi-valued field for the CSV by turning every "\a" separator
# into ";".
#
# @param str [String, nil] field whose values are separated by "\a"
# @return [String] the field with ";" separators, or '' when nil or empty
def to_csv_column(str)
    if str && str.length > 0
        str.gsub("\a", ';')
    else
        ''
    end
end

#trim_emails(emails) ⇒ `Object`

# File 'lib/encoded_word.rb', line 134

# Reduces an address field to the bare e-mail addresses it contains.
#
# The field is transcoded to UTF-8 from @input_enc (invalid or undefined
# characters are replaced). Each "@" is then located, the address around it
# is cut out with #getmail, and the addresses are joined with "\a".
#
# @param emails [String, nil] raw address field
# @return [String] "\a"-separated addresses, or '' when nil, empty or
#   containing no "@"
def trim_emails(emails)
    return '' unless emails && emails.length > 0
    text = emails.encode('utf-8', @input_enc, :undef=>:replace, :invalid=>:replace)
    found = []
    cursor = 0
    while (at = text.index('@', cursor))
        found << getmail(text, at)
        cursor = at + 1
    end
    found.join("\a")
end

#word_decode(input, out_charset = 'utf-8') ⇒ `Object`

# File 'lib/encoded_word.rb', line 185

# Decodes a MIME encoded-word ("=?charset?B|Q?payload?=") into +out_charset+.
#
# Input without an encoded-word is returned unchanged. Otherwise the payload
# is Base64 ("B") or quoted-printable ("Q") decoded and transcoded from the
# declared charset, replacing invalid or undefined characters. The charset
# label "utf8" is treated as "utf-8". If String#encode raises, Iconv is
# tried; if that raises too, '' is returned. Progress for the fallback path
# is printed to stdout.
#
# @param input [String] text that may contain an encoded-word
# @param out_charset [String] target encoding name
# @return [String] decoded text, +input+ itself when nothing matched, or ''
#   when every conversion attempt failed
def word_decode(input, out_charset = 'utf-8')
    # NOTE(review): @u8_enc is not assigned by any method shown for this
    # class; when nil, String#encode uses the string's own encoding as source.
    u8 = input.encode('utf-8', @u8_enc, :undef=>:replace, :invalid=>:replace)
    # NOTE(review): only the first encoded-word is decoded and any text around
    # it is dropped from the result - confirm that callers rely on this.
    charset, enc, payload = u8.scan(/=\?([A-Za-z0-9_-]+)\?([BQbq])\?([^\?]+)\?=/).first
    return input unless payload

    charset = 'utf-8' if charset.downcase == 'utf8'
    if enc.upcase == 'Q'
        # In the "Q" encoding an underscore stands for a space (RFC 2047).
        wd = payload.tr('_', ' ').unpack('M*').first
    else
        wd = payload.unpack('m*').first
    end

    begin
        return wd.encode(out_charset, charset, :undef=>:replace, :invalid=>:replace)
    rescue => e
        print "Cannot encode #{input} because #{e}, try iconv..."
        begin
            converted = Iconv.conv(out_charset + "//IGNORE", charset, wd)
            # Report success only once the fallback conversion has worked.
            puts "OK"
            return converted
        rescue => e
            puts "Iconv failed too because #{e}. Return ''."
            return ''
        end
    end
end

Class: EncodedWord

Instance Method Summary collapse

Constructor Details

#initialize(inputdir) ⇒ EncodedWord

Instance Method Details

#combine ⇒ Object

#combine_all_mlog_plain ⇒ Object

#concat_one_line(words) ⇒ Object

#decode ⇒ Object

#decode_all_mlog ⇒ Object

#decode_attach(attach) ⇒ Object

#decode_attaches(parts) ⇒ Object

#decode_subject(sub) ⇒ Object

#format_date(engdate) ⇒ Object

#getmail(line, at) ⇒ Object

#mysplit(line, sep = "\t") ⇒ Object

#to_csv_column(str) ⇒ Object

#trim_emails(emails) ⇒ Object

#word_decode(input, out_charset = 'utf-8') ⇒ Object

#initialize(inputdir) ⇒ `EncodedWord`

#combine ⇒ `Object`

#combine_all_mlog_plain ⇒ `Object`

#concat_one_line(words) ⇒ `Object`

#decode ⇒ `Object`

#decode_all_mlog ⇒ `Object`

#decode_attach(attach) ⇒ `Object`

#decode_attaches(parts) ⇒ `Object`

#decode_subject(sub) ⇒ `Object`

#format_date(engdate) ⇒ `Object`

#getmail(line, at) ⇒ `Object`

#mysplit(line, sep = "\t") ⇒ `Object`

#to_csv_column(str) ⇒ `Object`

#trim_emails(emails) ⇒ `Object`

#word_decode(input, out_charset = 'utf-8') ⇒ `Object`