# Broobles » eml2mbox » The script
 
#!/usr/bin/ruby
#============================================================================================#
# eml2mbox.rb v0.08                                                                          #
# Last updated: Jan 23, 2004                                                                 #
#                                                                                            #
# Converts a bunch of eml files into one mbox file.                                          #
#                                                                                            #
# Usage: [ruby] eml2mbox.rb [-c] [-l] [-s] [-yz] [emlpath [trgtmbx]]                         #
#         Switches:                                                                          #
#            -c Remove CRs (^M) appearing at end of lines (Unix)                             #
#            -l Remove LFs appearing at beginning of lines (old Mac) - not tested            #
#            -s Don't use standard mbox postmark formatting (for From_ line)                 #
#               This will force the use of original From and Date found in mail headers.     #
#               Not recommended, unless you really have problems importing emls.             #
#           -yz Use this to force the order of the year and timezone in date in the From_    #
#               line from the default [year][timezone] to [timezone][year].                  #
#         emlpath - Path of dir with eml files. Defaults to the current dir if not specified #
#         trgtmbx - Name of the target mbox file. Defaults to "archive.mbox" in 'emlpath'    #
#                                                                                            #
# Ruby homepage: http://www.ruby-lang.org/en/                                                #
# Unix mailbox format: http://www.broobles.com/eml2mbox/mbox.html                            #
# This script  : http://www.broobles.com/eml2mbox                                            #
#                                                                                            #
#============================================================================================#
# Licence:                                                                                   #
#                                                                                            #
# This script is free software; you can redistribute it and/or modify it under the terms of  #
# the GNU Lesser General Public License as published by the Free Software Foundation;        # 
# either version 2.1 of the License, or (at your option) any later version.                  #
#                                                                                            #
# You should have received a copy of the GNU Lesser General Public License along with this   #
# script; if not, please visit http://www.gnu.org/copyleft/gpl.html for more information.    #
#============================================================================================#

require "date"

# "parsedate" was dropped from the standard library in Ruby 1.9, so it is only
# loaded (and mixed in) when the running interpreter still ships it.
begin
    require "parsedate"
    include ParseDate
rescue LoadError
end

#=======================================================#
# Class that encapsulates the processing file in memory #
#=======================================================#

class FileInMemory

    # Timezone names that are translated to numeric offsets for the From_ line.
    ZoneOffset = {
        # Standard zones by RFC 2822
        'UTC' => '0000',
        'UT' => '0000', 'GMT' => '0000',
        'EST' => '-0500', 'EDT' => '-0400',
        'CST' => '-0600', 'CDT' => '-0500',
        'MST' => '-0700', 'MDT' => '-0600',
        'PST' => '-0800', 'PDT' => '-0700',
    }

    def initialize()
        @lines = Array.new
        @counter = 1          # keep the 0 position for the From_ line
        @from = nil           # from part of the From_ line
        @date = nil           # date part of the From_ line
    end

    # Adds one line of the eml file (line break included).
    # Besides storing the line, this picks up the first "From:" header that
    # contains an '@' and the first usable "Date:" header; both are needed later
    # to build the From_ line. Reads the global $switches hash.
    def addLine(line)
        # mbox readers take every line starting with "From " as the start of a
        # new message, so exactly those lines are quoted with '>'. Headers such
        # as "From:" are not From_ lines and must stay untouched.
        line = '>' + line if line =~ /\AFrom /

        # If the line is the first valid From line, save it (without the line break)
        if @from.nil? and line =~ /\AFrom:\s.*@/
            # chomp (not chop) so that a line without a line break keeps its last character
            @from = line.sub(/From:/, 'From').chomp
            @from = standardizeFrom(@from) unless $switches["noStandardFromLine"]
        end

        # Get the date
        if @date.nil? and line =~ /\ADate:\s/
            rawDate = line.sub(/Date:\s/, '')
            if $switches["noStandardFromLine"]
                # Don't parse the content of the Date header
                @date = rawDate
            else
                # Stays nil when the header cannot be parsed; getProcessedLines
                # then falls back to the current time instead of aborting the run.
                @date = parseHeaderDate(rawDate)
            end
        end

        # Now add the line to the array
        @lines[@counter] = fixLineEndings(line)
        @counter += 1
    end

    # Forms the first line (from + date) and returns all the lines as an array,
    # followed by one empty string that separates this message from the next.
    # Returns nil when no usable From: header was seen.
    def getProcessedLines()
        return nil if @from.nil?

        if @date.nil?
            puts "WARN: Failed to extract date. Will use current time in the From_ line"
            @date = formMboxDate(Time.now, nil)
        end
        @lines[0] = fixLineEndings(@from + " " + @date)
        @lines[@counter] = ""
        return @lines
    end

    # Fixes CR/LFs according to the "removeCRs" / "removeLFs" switches
    def fixLineEndings(line)
        line = removeCR(line) if $switches["removeCRs"]
        line = removeLF(line) if $switches["removeLFs"]
        return line
    end

    # emls usually have CR+LF (DOS) line endings, Unix uses LF as a line break,
    # so there's a hanging CR at the end of the line when viewed on Unix.
    # This method removes the next to the last character when it is a CR.
    def removeCR(line)
        # line[-2, 1] yields a one-character string on every Ruby version, whereas
        # line[-2] is an Integer on 1.8 but a String on 1.9+ (never equal to 0xD)
        line = line[0..-3] + line[-1..-1] if line[-2, 1] == "\r"
        return line
    end

    # Similar to the above, for the "removeLFs" switch: drops the last character
    # when it is a LF.
    def removeLF(line)
        line = line[0..-2] if line[-1, 1] == "\n"
        return line
    end

    private

    # Turns the content of a Date header into the mbox date used in the From_
    # line. Returns nil when the text holds no usable date.
    def parseHeaderDate(rawDate)
        begin
            parts = Date._parse(rawDate)
        rescue ArgumentError, TypeError
            return nil
        end
        year, month, day, hour, minute, second, timezone =
            parts.values_at(:year, :mon, :mday, :hour, :min, :sec, :zone)
        return nil if year.nil?

        # Need to convert the timezone from a name to a 4 digit offset; names
        # that are not in ZoneOffset give nil and the zone is then left out
        unless timezone and timezone =~ /\A[+-]\d{4}\z/
            timezone = ZoneOffset[timezone]
        end

        begin
            time = Time.gm(year, month, day, hour, minute, second)
        rescue ArgumentError
            return nil    # field out of range
        end
        return formMboxDate(time, timezone)
    end

end

#================#
# Helper methods #
#================#

# Reduces a From line to its bare address.
# Converts: 'From "some one" <aa@aa.aa>' -> 'From aa@aa.aa'
# fromLine is expected to start with "From " (its first five characters are kept).
# Returns the line unchanged when it holds no '<...>' pair.
def standardizeFrom(fromLine)
    # Get indexes of last "<" and ">" in line
    openIndex = fromLine.rindex('<')
    closeIndex = fromLine.rindex('>')
    # The last '>' has to close the last '<'; with the brackets in the opposite
    # order the slice would be empty and the address would be lost.
    if openIndex and closeIndex and openIndex < closeIndex
        return fromLine[0..4] + fromLine[(openIndex+1)...closeIndex]
    end
    # else leave as it is - it is either already well formed or is invalid
    return fromLine
end

# Builds the date part of an mbox postmark (From_ line) from a Time object.
# A nil timezone is simply left out. Otherwise the timezone goes after the
# year, or before it when the "zoneYearOrder" switch is set.
# mbox date format used is described here:
# http://www.broobles.com/eml2mbox/mbox.html
def formMboxDate(time,timezone)
    stamp = "%a %b %d %H:%M:%S"
    return time.strftime(stamp + " %Y") if timezone.nil?

    zone = timezone.to_s
    layout = $switches["zoneYearOrder"] ? "#{stamp} #{zone} %Y" : "#{stamp} %Y #{zone}"
    time.strftime(layout)
end


# Reads the leading command line arguments that start with "-" and returns a
# hash with the recognised switch names as keys and true as value; names that
# were not given read as false. Unknown switches are reported and skipped.
# Afterwards the two arguments following the switches are copied to the
# beginning of the ARGV array (ARGV[0] and ARGV[1]).
def extractSwitches()
    switches = Hash.new(false)  # anything not set below reads as false
    consumed = 0                # number of leading switch arguments
    ARGV.each do |arg|
        break unless arg =~ /^-/
        case arg
        when "-c"
            switches["removeCRs"] = true
            puts "\nWill fix lines ending with a CR"
        when "-l"
            switches["removeLFs"] = true
            puts "\nWill fix lines beggining with a LF"
        when "-s"
            switches["noStandardFromLine"] = true
            puts "\nWill use From and Date from mail headers in From_ line"
        when "-yz"
            switches["zoneYearOrder"] = true
            puts "\nTimezone will be placed before the year in From_ line"
        else
            puts "\nUnknown switch: "+arg+". Ignoring."
        end
        consumed += 1
    end
    # Move real arguments to the beginning of the array
    ARGV[0] = ARGV[consumed]
    ARGV[1] = ARGV[consumed+1]
    switches
end

#===============#
#     Main      #
#===============#

    $switches = extractSwitches()

    # Extract specified directory with emls and the target archive (if any)
    emlDir = ARGV[0] || "."                          # default if not specified
    mboxArchive = ARGV[1] || emlDir+"/archive.mbox"  # default if not specified

    # Show specified settings
    puts "\nSpecified dir : "+emlDir
    puts "Specified file: "+mboxArchive+"\n"

    # Check that the dir exists
    unless FileTest.directory?(emlDir)
        puts "\n["+emlDir+"] is not a directory (might not exist). Please specify a valid dir"
        exit(0)
    end
    Dir.chdir(emlDir)

    # From here on the working dir IS emlDir. The default archive lives in that
    # dir, so it is addressed by its bare name; re-using the emlDir-prefixed
    # path would apply a relative emlDir twice ("mails/mails/archive.mbox").
    # An explicitly given target is used as typed (relative to emlDir).
    mboxPath = ARGV[1].nil? ? "archive.mbox" : mboxArchive

    # Look for the emls before the archive is touched, so that an existing
    # archive is neither truncated nor deleted when there is nothing to convert.
    files = Dir["*.eml"]
    if files.size == 0
        puts
        puts "No *.eml files in this directory. mbox file not created."
        exit(0)
    end

    # Check if destination file exists. If yes allow user to select an option.
    openMode = "w"    # File doesn't exist, open for writing
    if FileTest.exist?(mboxPath)
        print "\nFile ["+mboxArchive+"] exists! Please select: [A]ppend  [O]verwrite  [C]ancel (default) "
        # gets returns nil on end of input; treat that like an empty answer (cancel)
        sel = (STDIN.gets || "").chomp
        case sel
        when 'A', 'a' then openMode = "a"
        when 'O', 'o' then openMode = "w"
        else openMode = nil    # canceled
        end
    end

    if openMode
        puts
        # Block forms close the archive and every eml even if an error is raised
        File.open(mboxPath, openMode) do |aFile|
            # For each .eml file in the specified directory do the following
            files.each do |x|
                puts "Processing file: "+x
                thisFile = FileInMemory.new()
                File.foreach(x) {|item| thisFile.addLine(item) }
                lines = thisFile.getProcessedLines
                if lines == nil
                    puts "WARN: File ["+x+"] doesn't seem to have a regular From: line. Not included in mbox"
                else
                    lines.each {|line| aFile.puts line}
                end
            end
        end
    end
    
# Copyright 2002-2004 Broobles