forked from gnh1201/lecture-sonsemail
-
Notifications
You must be signed in to change notification settings - Fork 0
/
day2.py
52 lines (43 loc) · 1.58 KB
/
day2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import sys
import os
import json
import datetime
import eml_parser
from bs4 import BeautifulSoup
from konlpy.tag import Kkma
def json_serial(obj):
    """Convert *obj* into a JSON-encodable value for ``json.dumps(default=...)``.

    Args:
        obj: An object the JSON encoder could not serialize on its own.

    Returns:
        The ISO 8601 string of *obj* when it is a ``datetime.datetime``.

    Raises:
        TypeError: If *obj* is not a ``datetime.datetime``.
    """
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    # Falling through would return None, which json.dumps emits as ``null``
    # and thereby silently hides values of unsupported types.
    raise TypeError(
        "Object of type %s is not JSON serializable" % type(obj).__name__
    )
def tokenize(text):
    """Return whatever the module-level ``kkma`` tagger's ``nouns`` yields for *text*."""
    nouns = kkma.nouns(text)
    return nouns
def parse_content(content):
    """Extract the text of *content*, tokenize it, and print the terms.

    The markup is parsed with BeautifulSoup's ``lxml`` parser, the resulting
    text is passed to ``tokenize``, and the terms are written to stdout as a
    single comma-separated line. Nothing is returned.
    """
    soup = BeautifulSoup(content, "lxml")
    plain_text = soup.text
    joined_terms = ','.join(tokenize(plain_text))
    print(joined_terms)
def main(args):
    """Walk ``./data`` and print the tokenized text of every ``.eml`` file.

    For each e-mail found, the contents of all body parts and the subject
    header are collected and the combined text is handed to
    ``parse_content``. E-mails that yield no text are skipped.

    Args:
        args: Command-line arguments; currently unused.
    """
    for root, _dirs, files in os.walk("./data"):
        for file in files:
            # Match on the actual ".eml" suffix (case-insensitively) so that
            # an extension-less file that merely happens to be named "eml"
            # is not mistaken for an e-mail.
            if not file.lower().endswith(".eml"):
                continue

            f_path = os.path.join(root, file)
            with open(f_path, 'rb') as fh:
                raw_email = fh.read()

            # NOTE(review): the positional True is presumably
            # include_raw_body, which the body "content" lookup below relies
            # on -- confirm against the installed eml_parser version.
            parsed_eml = eml_parser.eml_parser.decode_email_b(raw_email, True)

            parts = []

            # get text from body.content
            for e_body in parsed_eml.get("body", []):
                content = e_body.get("content")
                if content:
                    parts.append(content)

            # get text from header.subject
            subject = parsed_eml.get("header", {}).get("subject")
            if subject:
                parts.append(subject)

            # Join with a newline so the last word of one part is not fused
            # with the first word of the next before tokenization.
            if parts:
                parse_content("\n".join(parts))
# The tagger lives at module level because tokenize() looks it up as a global.
kkma = Kkma()

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)