-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuploader.py
71 lines (59 loc) · 2.35 KB
/
uploader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import json, sys, logging, re
import pathlib
from libs.mongo_wrapper import MongoWrapper
from arguments import get_preprocessing_args
import emoji
# Send timestamped log records of level DEBUG and above to stdout.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='[%(asctime)s] %(levelname)s: %(message)s',
)
# Lazily-built (allowed-charset, URL, hash-run) patterns shared by all calls,
# so the large emoji character class is assembled and compiled only once.
_FILTER_PATTERNS = None


def _emoji_characters():
    """Return the emoji package's code points as a regex-class-safe string."""
    # NOTE(review): newer emoji releases expose the table as ``EMOJI_DATA`` and
    # reportedly no longer ship ``UNICODE_EMOJI`` -- confirm against the
    # pinned emoji version.
    table = getattr(emoji, 'EMOJI_DATA', None)
    if table is None:
        table = emoji.UNICODE_EMOJI
        # Some releases nest the table per language code ({'en': {...}, ...});
        # joining those top-level keys would whitelist the letters of
        # 'en', 'es', ... instead of the emoji themselves.
        nested = table.get('en')
        if isinstance(nested, dict):
            table = nested
    # Many emoji share code points (ZWJ, variation selectors, skin tones);
    # deduplicate so the character class stays small, and escape each one so
    # no key can alter the class syntax.
    code_points = sorted(set(''.join(table)))
    return ''.join(re.escape(ch) for ch in code_points)


def _compile_filter_patterns():
    """Compile the three patterns used by ``filter_text``."""
    emojis = _emoji_characters()
    # Anything NOT in this set is removed: space, the listed punctuation,
    # ASCII, Hangul consonant jamo, Hangul syllables and emoji code points.
    allowed = re.compile(f'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-ㅎ가-힣{emojis}]+')
    url = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
    hashes = re.compile(r'#+')
    return allowed, url, hashes


def filter_text(sent):
    """Clean one sentence for ingestion.

    Steps, in order:
      1. drop every run of characters outside the allowed set (this removes
         e.g. Chinese characters and the standalone vowel jamo ㅏ-ㅣ),
      2. drop http/https URLs,
      3. drop runs of '#',
      4. strip leading/trailing whitespace.

    Args:
        sent: the raw text.

    Returns:
        The cleaned text (possibly empty).
    """
    global _FILTER_PATTERNS
    if _FILTER_PATTERNS is None:
        _FILTER_PATTERNS = _compile_filter_patterns()
    allowed, url, hashes = _FILTER_PATTERNS

    sent = allowed.sub('', sent)
    sent = url.sub('', sent)
    sent = hashes.sub('', sent)
    return sent.strip()
def _collect_filtered_texts(docs, start_idx):
    """Filter every paragraph of *docs* into insertable records.

    Args:
        docs: iterable of documents, each with a 'paragraph' list whose
            items carry the raw text under 'form'.
        start_idx: running index to assign to the first kept record.

    Returns:
        A ``(records, next_idx)`` tuple: the kept
        ``{'filt_text': ..., 'idx': ...}`` records and the index to use for
        the next record.
    """
    records = []
    idx = start_idx
    for doc in docs:
        for para in doc['paragraph']:
            filt_text = filter_text(para['form'])
            # Skip fragments shorter than 5 characters; this also covers the
            # lone '.' case, so no separate check is needed.
            if len(filt_text) < 5:
                continue
            records.append({'filt_text': filt_text, 'idx': idx})
            idx += 1
    return records, idx


def main():
    """Load the corpus JSON files, filter their text and insert it into MongoDB."""
    args = get_preprocessing_args()
    md = MongoWrapper(args.config_src)
    with open(args.config_src) as fp:
        config = json.load(fp)
    # The first configured collection is the insertion target.
    src_collection = config['COLLECTIONS'][0]
    logging.info("[Source]: %s", src_collection)

    idx = 0  # running record index, continuous across all files
    for fpath in pathlib.Path('./data_files/nikl_web').glob('*.json'):
        # Files whose path contains "ES" are excluded from ingestion.
        if 'ES' in str(fpath):
            continue
        # Close the handle deterministically and decode explicitly as UTF-8
        # rather than relying on the platform default encoding.
        with fpath.open(encoding='utf-8') as fp:
            corpus = json.load(fp)
        text_set, idx = _collect_filtered_texts(corpus['document'], idx)
        md.insert_docs(text_set, collection_name=src_collection)
        # Presumably refreshes the collection's bookkeeping after each
        # batch -- verify against MongoWrapper.
        md.update_meta_info(collection_name=src_collection)


if __name__ == '__main__':
    main()
# docs = data['document']
# text_set = []
# for doc in docs:
# ut_list = doc['utterance']
# for u in ut_list:
# text = u['form']
# data = {'form': text, 'idx': idx}
# text_set.append(data)
# idx += 1
#
# md.insert_docs(text_set, collection_name=src_collection)
# md.update_meta_info(collection_name=src_collection)