#python #email #extract
Вопрос:
У меня есть проект по извлечению вложений .msg из нескольких файлов .eml, созданных в Outlook.
Я делал это в прошлом для обычных вложений, docx, txt, pdf и т. Д.
Когда дело доходит до файлов .msg, вложенных в файл .eml, я получаю следующий ответ.
Fail: <_io.TextIOWrapper name='file1.eml' mode='r' encoding='cp1252'>
Fail: <_io.TextIOWrapper name='file2.eml' mode='r' encoding='cp1252'>
Fail: <_io.TextIOWrapper name='file3.eml' mode='r' encoding='cp1252'>
Fail: <_io.TextIOWrapper name='file4.eml' mode='r' encoding='cp1252'>
Done: Processed 4 files with 0 attachments.
Сценарий, который я изменял, изначально был взят из beamzer на github.
import glob
import os
import email
import argparse
from multiprocessing import Pool
from cs.rfc2047 import unrfc2047
EXTENSION = "eml"
parser = argparse.ArgumentParser(description='extract attachments from eml files')
parser.add_argument(
'-d','--debug',
action='store_true',
help='print debug messages to stderr'
)
parser.add_argument(
'-s','--single',
action='store_true',
help='run as single thread (default = multithreaded, one thread per core)'
)
parser.add_argument(
'-q','--quiet',
action='store_true',
help='no output'
)
args = parser.parse_args()
debug = args.debug
single = args.single
quiet = args.quiet
debug and print("debug output is active")
# ensure that an output dir exists
od = "attachments"
# the exist_ok=True avoids error messages due to us being multithreaded and race-conditions
# that should be no problem since we moved this out of the repetitive extract function
os.path.exists(od) or os.makedirs(od,exist_ok=True)
def extract(filename):
"""
Try to extract the attachments from filename
"""
debug and print("=> reading %s" % filename)
output_count = 0
try:
with open(filename, "r") as f:
try:
msg = email.message_from_file(f)
nratt = len(msg.get_payload())
# this will be 4000something if no attachments are present
if (nratt > 1 and nratt < 20):
for attachment in msg.get_payload()[1:]:
of = attachment.get_filename()
debug and print("attachment name: %s" % of)
# handle multi-line strings, and other problematic characters
of = of.replace("n", "")
of = of.replace("t", "_")
of = of.replace("*", "#")
# this is to handle RFC2047 MIME encoded filenames (often used for obfuscation)
try:
output_filename = unrfc2047(of)
if ( of != output_filename):
debug and print("decoded attachment name: %s" % output_filename)
except Exception as inst:
print(type(inst)) # the exception instance
print(inst.args) # arguments stored in .args
print(inst) # __str__ allows args to be printed directly
# If no attachments are found, skip this file
if output_filename:
# check if this filename already exists
fn = od "/" output_filename
debug and print("checking existence of %s" % fn)
expand = 0
if os.path.isfile(fn):
while True:
expand = 1
# add the increment before the filename extension
fn_name, fn_ext = os.path.splitext(output_filename)
new_filename = fn_name "_" str(expand) fn_ext
fn = od "/" new_filename
if os.path.isfile(fn):
continue
else:
output_filename = new_filename
break
not(quiet) and print("Writing %s " % output_filename)
with open(os.path.join(od, output_filename), "wb") as of:
of.write(attachment.get_payload(decode=True))
output_count = 1
if output_count == 0:
not(quiet) and print("No attachment found for file %s!" % f.name)
except Exception:
print('Fail: %sn' % f)
# this should catch read and write errors
except IOError:
not(quiet) and print("Problem with %s or one of its attachments!" % f.name)
return 1, output_count
if __name__ == "__main__":
if not(single):
debug and print("running multithreaded")
# let's do this in parallel, using cpu count as number of threads
pool = Pool(None)
res = pool.map(extract, glob.iglob("*.%s" % EXTENSION))
# need these if we use _async
pool.close()
pool.join()
# 2-element list holding number of files, number of attachments
numfiles = [sum(i) for i in zip(*res)]
not(quiet) and print("Done: Processed {} files with {} attachments.".format(*numfiles))
else:
filecnt = 0
cnt = 0
debug and print("running single threaded")
for file in glob.glob("*.%s" % EXTENSION):
filecnt = 1
cnt = extract(file)[1]
not(quiet) and print("Done: Processed %s files with %s attachments." % (filecnt, cnt))