Как извлечь файл .msg?

#python #email #extract

Вопрос:

У меня есть проект по извлечению вложений .msg из нескольких файлов .eml, созданных в Outlook.

Я делал это в прошлом для обычных вложений, docx, txt, pdf и т. Д.

Когда дело доходит до файлов .msg, вложенных в файл .eml, я получаю следующий ответ.

 Fail: <_io.TextIOWrapper name='file1.eml' mode='r' encoding='cp1252'>

Fail: <_io.TextIOWrapper name='file2.eml' mode='r' encoding='cp1252'>

Fail: <_io.TextIOWrapper name='file3.eml' mode='r' encoding='cp1252'>

Fail: <_io.TextIOWrapper name='file4.eml' mode='r' encoding='cp1252'>

Done: Processed 4 files with 0 attachments.
 

Сценарий, который я изменял, изначально был взят из beamzer на github.

 import glob
import os
import email
import argparse
from multiprocessing import Pool
from cs.rfc2047 import unrfc2047

EXTENSION = "eml"

parser = argparse.ArgumentParser(description='extract attachments from eml files')
parser.add_argument(
    '-d','--debug',
    action='store_true',
    help='print debug messages to stderr'
)
parser.add_argument(
    '-s','--single',
    action='store_true',
    help='run as single thread (default = multithreaded, one thread per core)'
)
parser.add_argument(
    '-q','--quiet',
    action='store_true',
    help='no output'
)
args = parser.parse_args()
debug = args.debug
single = args.single
quiet = args.quiet
debug and print("debug output is active")

# ensure that an output dir exists
od = "attachments"
# the exist_ok=True avoids error messages due to us being multithreaded and race-conditions
# that should be no problem since we moved this out of the repetitive extract function
os.path.exists(od) or os.makedirs(od,exist_ok=True)

def extract(filename):
    """
    Try to extract the attachments from filename
    """
    debug and print("=> reading %s" % filename)
    output_count = 0
    try:
        with open(filename, "r") as f:
            try:
                msg = email.message_from_file(f)
                nratt = len(msg.get_payload())
                # this will be 4000something if no attachments are present
                if (nratt > 1 and nratt < 20):
                    for attachment in msg.get_payload()[1:]:
                        of = attachment.get_filename()
                        debug and print("attachment name: %s" % of)

                        # handle multi-line strings, and other problematic characters
                        of = of.replace("n", "")
                        of = of.replace("t", "_")
                        of = of.replace("*", "#")

                        # this is to handle RFC2047 MIME encoded filenames (often used for obfuscation)
                        try:
                            output_filename = unrfc2047(of)
                            if ( of != output_filename):
                                debug and print("decoded attachment name: %s" % output_filename)
                        except Exception as inst:
                            print(type(inst))    # the exception instance
                            print(inst.args)     # arguments stored in .args
                            print(inst)          # __str__ allows args to be printed directly

                        # If no attachments are found, skip this file
                        if output_filename:
                            # check if this filename already exists
                            fn = od   "/"   output_filename
                            debug and print("checking existence of %s" % fn)
                            expand = 0
                            if os.path.isfile(fn):
                                while True:
                                    expand  = 1
                                    # add the increment before the filename extension
                                    fn_name, fn_ext = os.path.splitext(output_filename)
                                    new_filename = fn_name   "_"   str(expand)   fn_ext
                                    fn = od   "/"   new_filename
                                    if os.path.isfile(fn):
                                        continue
                                    else:
                                        output_filename = new_filename
                                        break
                            not(quiet) and print("Writing %s " % output_filename)
                            with open(os.path.join(od, output_filename), "wb") as of:
                                of.write(attachment.get_payload(decode=True))
                                output_count  = 1

                if output_count == 0:
                    not(quiet) and print("No attachment found for file %s!" % f.name)
            except Exception:
                print('Fail: %sn' % f)

    # this should catch read and write errors
    except IOError:
        not(quiet) and print("Problem with %s or one of its attachments!" % f.name)
    return 1, output_count

if __name__ == "__main__":
    if not(single):
        debug and print("running multithreaded")
        # let's do this in parallel, using cpu count as number of threads
        pool = Pool(None)
        res = pool.map(extract, glob.iglob("*.%s" % EXTENSION))
        # need these if we use _async
        pool.close()
        pool.join()
        # 2-element list holding number of files, number of attachments
        numfiles = [sum(i) for i in zip(*res)]
        not(quiet) and print("Done: Processed {} files with {} attachments.".format(*numfiles))
    else:
        filecnt = 0
        cnt = 0
        debug and print("running single threaded")
        for file in glob.glob("*.%s" % EXTENSION):
            filecnt  = 1
            cnt  = extract(file)[1]
        not(quiet) and print("Done: Processed %s files with %s attachments." % (filecnt, cnt))