Pythonでemlファイルを読み取る方法は？

Question

python 3.4でemlファイルをロードする方法がわかりません。
すべてのemlファイルを一覧にして、Pythonでまとめて読み込みたいです。

enter image description here

Dalen · Answer

これは、電子メール（つまり *.eml ファイル）の内容を取得する方法です。Python 2.5〜2.7 では問題なく動作します。Python 3 でも試してみてください。同様に動作するはずです。

 # Extract headers, text, HTML and attachments from an e-mail (*.eml) message.
from email import message_from_file
import email
import os

# Path to directory where attachments will be stored:
path = "./msgfiles"


# To have attachments extracted into memory, change behaviour of the
# 2 following functions:

def file_exists(f):
    """Check whether file *f* was already extracted into ``path``."""
    return os.path.exists(os.path.join(path, f))


def save_file(fn, cont):
    """Save the bytes *cont* to the file *fn* inside ``path``.

    The target directory is created on first use, and the file is closed
    even if writing fails.
    """
    if not os.path.isdir(path):
        os.makedirs(path)
    if cont is None:
        # A part without any payload is stored as an empty file.
        cont = b""
    with open(os.path.join(path, fn), "wb") as out:
        out.write(cont)


def _safe_name(name):
    """Return only the final path component of *name*.

    Keys and attachment names may come from untrusted mail, so directory
    parts (including "..") are dropped to keep every file inside ``path``.
    """
    return os.path.basename(name.replace("\\", "/"))


def construct_name(id, fn):
    """Construct a file name out of a message ID and a packed file name.

    The first two dot-separated pieces of *id* are glued together and used
    as a prefix, e.g. ("message.eml", "a.txt") -> "messageeml.a.txt".
    An *id* without any dot is used as-is instead of raising IndexError.
    """
    pieces = _safe_name(id).split(".")
    return "".join(pieces[:2]) + "." + _safe_name(fn)


def disqo(s):
    """Remove surrounding double or single quotation marks."""
    s = s.strip()
    if s.startswith("'") and s.endswith("'"):
        return s[1:-1]
    if s.startswith('"') and s.endswith('"'):
        return s[1:-1]
    return s


def disgra(s):
    """Remove < and > from an HTML-like tag, e-mail address or e-mail ID."""
    s = s.strip()
    if s.startswith("<") and s.endswith(">"):
        return s[1:-1]
    return s


def _payload_text(part):
    """Return the decoded payload of a text part as a string.

    On Python 3 ``get_payload(decode=True)`` yields bytes, which are decoded
    with the part's declared charset (falling back to UTF-8, with undecodable
    bytes replaced).  On Python 2 the payload is already a ``str`` and is
    returned unchanged.
    """
    raw = part.get_payload(decode=True)
    if raw is None:
        return ""
    if isinstance(raw, str):
        return raw
    charset = part.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # Unknown charset label in the message.
        return raw.decode("utf-8", errors="replace")


def _store_part(m, key, fn, cid, Files):
    """Register part *m* under *fn* in *Files* and save it unless already saved."""
    cfn = construct_name(key, fn)
    Files[fn] = (cfn, cid)
    if not file_exists(cfn):
        save_file(cfn, m.get_payload(decode=True))


def pullout(m, key):
    """Extract content from an e-mail message.

    This works for multipart and nested multipart messages too.

    m   -- email.Message() or mailbox.Message()
    key -- Initial message ID (some string)

    Returns Tuple(Text, Html, Files, Parts)
    Text  -- All text from all parts.
    Html  -- All HTMLs from all parts
    Files -- Dictionary mapping each extracted file name to a tuple
             (name of the saved file, content ID or None).
    Parts -- Number of parts in original message.
    """
    Html = ""
    Text = ""
    Files = {}

    if m.is_multipart():
        # Recurse into every sub-part and merge the results.
        Parts = 0
        for part in m.get_payload():
            t, h, f, p = pullout(part, key)
            Text += t
            Html += h
            Files.update(f)
            Parts += p
        return Text, Html, Files, Parts

    fn = m.get_filename()
    if fn:
        # It's an attachment.
        _store_part(m, key, fn, None, Files)
        return Text, Html, Files, 1

    # Not an attachment!
    # See where this belongs. Text, Html or some other data:
    cp = m.get_content_type()
    if cp == "text/plain":
        Text += _payload_text(m)
    elif cp == "text/html":
        Html += _payload_text(m)
    else:
        # Something else!
        # This is some packed file and the name is contained in the
        # content-type header instead of the content-disposition header.
        header = str(m.get("content-type") or "")
        cid = m.get("content-id")
        if cid is not None:
            cid = disgra(str(cid))
        # Find file name:
        start = header.find("name=")
        if start == -1:
            return Text, Html, Files, 1
        end = header.find(";", start)
        if end == -1:
            end = None
        fn = disqo(header[start + 5:end])
        _store_part(m, key, fn, cid, Files)
    return Text, Html, Files, 1


def extract(msgfile, key):
    """Extract all data from an e-mail, including From, To, etc.

    msgfile -- A file-like readable object, opened in text or binary mode
    key     -- Some ID string for that particular Message.
               Can be a file name or anything.

    Returns dict()
    Keys: from, to, subject, date, text, html, parts[, files]
    Key files will be present only when message contained binary files.
    For more see __doc__ for pullout() and caption() functions.
    """
    # A zero-length read tells text and binary file objects apart without
    # consuming any input.
    if isinstance(msgfile.read(0), str):
        m = message_from_file(msgfile)
    else:
        m = email.message_from_binary_file(msgfile)
    From, To, Subject, Date = caption(m)
    Text, Html, Files, Parts = pullout(m, key)
    Text = Text.strip()
    Html = Html.strip()
    msg = {"subject": Subject, "from": From, "to": To, "date": Date,
           "text": Text, "html": Html, "parts": Parts}
    if Files:
        msg["files"] = Files
    return msg


def caption(Origin):
    """Extract To, From, Subject and Date from a message.

    Origin -- email.Message() or mailbox.Message() object

    Returns Tuple(From, To, Subject, Date)
    If the message doesn't contain one or more of them, empty strings are
    returned in their place.
    """
    def header(name):
        # Indexing a Message yields None for a missing header.
        value = Origin[name]
        return "" if value is None else str(value).strip()

    return header("from"), header("to"), header("subject"), header("date")

# Usage:
# The file is opened in binary mode so the raw bytes of the message reach the
# parser untouched; the with-block closes it even if extraction fails.
with open("message.eml", "rb") as f:
    print(extract(f, f.name))

これは mailbox モジュールを使うメーリンググループ用に書いたものなので、かなり込み入っています。それでも、私の環境で失敗したことは一度もなく、ゴミが混ざることもありませんでした。メッセージがマルチパートの場合、出力ディクショナリにはキー「files」（サブディクショナリ）が含まれ、テキストでも HTML でもない、抽出されたその他のファイルのファイル名がすべて入ります。これは、添付ファイルやその他のバイナリデータを抽出するための方法です。この動作は pullout() の中で変更することも、file_exists() と save_file() の動作を変えるだけで済ませることもできます。

construct_name（）は、メッセージIDとマルチパートメッセージファイル名（存在する場合）からファイル名を作成します。

pullout() では、Text 変数と Html 変数は文字列です。オンラインのメーリンググループ用途では、マルチパートに含まれる添付ファイル以外のテキストや HTML を、ひとまとめにして取得できれば十分でした。

より凝った処理が必要な場合は、Text と Html をリストに変更して各パートを追加（append）し、必要に応じて受け渡してください。難しいことはありません。

email.Message() ではなく mailbox.Message() で動作させることを想定して書いたため、ここにはいくつか誤りが残っているかもしれません。email.Message() でも試してみましたが、問題なく動作しました。

あなたは「それらすべてをリストしたい」と書いていますが、どこからでしょうか？ POP3 のメールボックスや、オープンソースのメーラーのメールボックスを指しているのであれば、mailbox モジュールを使って行います。それ以外のメーラーから一覧を取得したい場合は厄介です。たとえば、MS Outlook からメールを取得するには、OLE2 複合ファイルの読み方を知っている必要があります。他のメーラーがメールを *.eml ファイルと呼ぶことはめったにないので、あなたがやりたいのはまさにこれだと思います。その場合は、PyPI で olefile または compoundfiles モジュールを探し、MS Outlook の受信トレイファイルからメールを抽出する方法を Google で調べてください。あるいは、そうした面倒は避けて、Outlook からどこかのディレクトリにエクスポートするだけでも構いません。eml ファイルとして手に入ったら、このコードを適用してください。

Mike · Answer

はるかに簡単な、次のコードを見つけました。

# Save every attachment of every *.eml file found in `path`.
import email
import os

path = './'


def save_attachments(eml_path, dest_dir):
    """Write all named attachments of one .eml file into *dest_dir*.

    eml_path -- path of the message file to read
    dest_dir -- existing directory that receives the attachments

    Returns the list of file paths written.  Parts without a file name
    (message bodies, multipart containers) are skipped.  Nested multipart
    messages are searched as well.
    """
    # Binary mode lets the parser see the raw bytes, so 8-bit mail does not
    # depend on the locale's text encoding.
    with open(eml_path, 'rb') as source:
        msg = email.message_from_binary_file(source)

    saved = []
    for part in msg.walk():
        if part.is_multipart():
            continue
        fnam = part.get_filename()
        if not fnam:
            continue
        data = part.get_payload(decode=True)
        if data is None:
            continue
        # The name comes from the mail itself: keep only its last component
        # so that it cannot point outside dest_dir.
        safe_name = os.path.basename(fnam.replace('\\', '/'))
        if not safe_name:
            continue
        target = os.path.join(dest_dir, safe_name)
        with open(target, 'wb') as out:
            out.write(data)
        saved.append(target)
    return saved


for fle in os.listdir(path):
    if os.path.splitext(fle)[1].lower() != '.eml':
        continue
    try:
        save_attachments(os.path.join(path, fle), path)
    except OSError as detail:
        # Best effort: report the unreadable/unwritable file and go on.
        print('Skipping %s: %s' % (fle, detail))

IvanTheFirst · Answer

これを試してみてください：

#!python3
# -*- coding: utf-8 -*-
import email
import os

SOURCE_DIR = 'email'
DEST_DIR = 'temp'


def extractattachements(fle, suffix=None, dest_dir=None):
    """Save the application/octet-stream attachments of one message file.

    fle      -- path of the .eml file to read
    suffix   -- optional text inserted as "_<suffix>" before the extension
                of every saved file name
    dest_dir -- directory that receives the files; defaults to DEST_DIR and
                is created when missing

    Returns the list of paths written (empty for non-multipart messages).
    """
    if dest_dir is None:
        dest_dir = DEST_DIR

    # Binary mode hands the raw bytes to the parser, so 8-bit mail does not
    # depend on the locale's text encoding; the handle is closed right away.
    with open(fle, 'rb') as source:
        message = email.message_from_binary_file(source)

    filenames = []
    if message.get_content_maintype() != 'multipart':
        return filenames

    for part in message.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        # get_content_type() never returns None, unlike the raw header.
        if part.get_content_type() != 'application/octet-stream':
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            continue

        # The name comes from the mail itself: keep only its last component
        # so it cannot point outside dest_dir, and fall back to a generated
        # name when the part carries none.
        filename = os.path.basename(
            (part.get_filename() or '').replace('\\', '/'))
        if not filename:
            filename = 'attachment-%d.bin' % (len(filenames) + 1)
        if suffix:
            # splitext copes with names that have no dot or several dots.
            root, ext = os.path.splitext(filename)
            filename = ''.join([root, '_', suffix, ext])

        os.makedirs(dest_dir, exist_ok=True)
        filename = os.path.join(dest_dir, filename)
        with open(filename, 'wb') as fb:
            fb.write(payload)
        filenames.append(filename)
    return filenames


def main():
    """Extract the attachments of every regular file in SOURCE_DIR."""
    for name in os.listdir(SOURCE_DIR):
        candidate = os.path.join(SOURCE_DIR, name)
        if os.path.isfile(candidate):
            extractattachements(candidate)
    return True


if __name__ == "__main__":
    main()