# day1.py
# Forked from gnh1201/lecture-sonsemail.
import sys
import os
import json
import datetime
import eml_parser
from bs4 import BeautifulSoup
def json_serial(obj):
    """Serialize objects that ``json.dumps`` cannot encode natively.

    Meant to be passed as the ``default=`` hook of ``json.dumps``.

    Args:
        obj: The object the JSON encoder could not serialize on its own.

    Returns:
        The ISO 8601 string of ``obj`` when it is a ``datetime.datetime``,
        ``datetime.date`` or ``datetime.time`` instance.

    Raises:
        TypeError: If ``obj`` is of any other type. Raising (instead of
            implicitly returning ``None``) keeps ``json.dumps`` from silently
            writing ``null`` in place of data it could not encode.
    """
    # datetime.datetime is a subclass of datetime.date, so it is covered here.
    if isinstance(obj, (datetime.date, datetime.time)):
        return obj.isoformat()
    raise TypeError(
        "Object of type %s is not JSON serializable" % type(obj).__name__
    )
def tokenize(text):
    """Split ``text`` into whitespace-delimited tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty tokens. Runs of spaces, tabs and newlines count
        as a single separator, so no empty-string tokens are produced and an
        empty or whitespace-only input yields an empty list.
    """
    # split() with no argument splits on any whitespace run; split(' ') would
    # emit '' for every extra space and leave newlines/tabs inside tokens.
    return text.split()
def parse_content(content):
    """Print the text of a body as space-joined tokens.

    Args:
        content: Markup passed to BeautifulSoup with the "lxml" parser.
    """
    # Drop the markup and keep only the text nodes.
    soup = BeautifulSoup(content, "lxml")
    tokens = tokenize(soup.text)
    print(' '.join(tokens))
def main(args):
    """Walk ``./data`` and print the body text of every ``.eml`` file found.

    Each matching file is read as bytes, decoded with ``eml_parser`` and every
    entry of the parsed ``"body"`` list is handed to ``parse_content``.

    Args:
        args: Command-line arguments; not used by this function.
    """
    for root, _dirs, files in os.walk("./data"):
        for name in files:
            # Require a real ".eml" suffix. Splitting on "." and taking the
            # last piece also matched a file literally named "eml".
            if not name.endswith(".eml"):
                continue

            f_path = os.path.join(root, name)
            with open(f_path, 'rb') as fh:
                raw_email = fh.read()

            # NOTE(review): the positional True is presumably
            # include_raw_body -- confirm against the installed eml_parser.
            parsed_eml = eml_parser.eml_parser.decode_email_b(raw_email, True)
            # Debug aid: print(json.dumps(parsed_eml, default=json_serial))

            if "body" in parsed_eml:
                for e_body in parsed_eml["body"]:
                    parse_content(e_body["content"])
# Run only when executed as a script, so importing this module does not
# start walking ./data as a side effect.
if __name__ == "__main__":
    main(sys.argv)