preprocessing.py
import os
import re

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords


def text_clean(text):
    """Lower-case a sentence, expand common contractions and normalise punctuation/whitespace."""
    text = str(text).lower()
    # Keep letters, digits and a small set of punctuation; replace everything else with spaces.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    # Expand common English contractions.
    text = re.sub(r"\'s", " is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"\'m", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # Strip or pad punctuation so it tokenizes cleanly.
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    # Expand shorthand like "5k" to "5000".
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r"\0s", "0", text)
    # Collapse runs of whitespace into a single space.
    text = re.sub(r"\s{2,}", " ", text)
    # Optional stop-word removal (currently disabled):
    # stop_words = stopwords.words('english')
    # filter_words = [w for w in text.split() if not w in stop_words]
    return text  # " ".join(filter_words).strip()
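
# Illustrative example (hypothetical sentence, not from the STS data):
#   text_clean("I can't believe it's 5k!")
#   -> "i cannot believe it is 5000 ! "   (the padded "!" leaves a trailing space)
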
def one_hot_encode(label):
    """Turn a 0-5 similarity score into a one-hot vector of length 6."""
    label_one_hot = [0] * 6
    label = round(label)
    label_one_hot[label] = 1
    return label_one_hot
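
# Illustrative example: one_hot_encode(3.4) -> [0, 0, 0, 1, 0, 0], since round(3.4) == 3.
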
def load_data(folder_path):
    """
    Load training data.
    :param folder_path: folder containing the "*input*.txt" / "*gs*.txt" file pairs
    :return: first sentences, second sentences, labels and one-hot labels
    """
    text1 = []
    text2 = []
    one_hot_labels = []
    labels = []
    for filename in os.listdir(folder_path):
        # Every "*input*.txt" file has a matching "*gs*.txt" gold-standard file.
        if re.search(r'.*input.*txt', filename):
            with open(os.path.join(folder_path, filename)) as f:
                text_lines = f.readlines()
            gs_file = filename.replace("input", "gs")
            with open(os.path.join(folder_path, gs_file)) as f:
                labels_lines = f.readlines()
            for text, label in zip(text_lines, labels_lines):
                # Skip sentence pairs that have no gold-standard score.
                if label != '' and label != '\n':
                    t1, t2 = text.strip().split('\t')[:2]
                    text1.append(text_clean(t1))
                    text2.append(text_clean(t2))
                    one_hot_labels.append(one_hot_encode(float(label)))
                    labels.append(float(label))
    return text1, text2, labels, one_hot_labels
def load_eval_data(folder_path, categories):
    """
    Load evaluation data and drop sentence pairs without a label.
    :param folder_path: folder containing the STS2016 input / gold-standard files
    :param categories: a list of source categories (e.g. "headlines")
    :return: first sentences, second sentences, labels, one-hot labels and the
             cumulative index boundaries of each category
    """
    text1 = []
    text2 = []
    one_hot_labels = []
    labels = []
    indexes = [0]
    current_len = 0
    for category in categories:
        labels_count = 0
        try:
            with open(os.path.join(folder_path, "STS2016.input." + category + ".txt")) as f:
                text_lines = f.readlines()
            with open(os.path.join(folder_path, "STS2016.gs." + category + ".txt")) as f:
                labels_lines = f.readlines()
            for text, label in zip(text_lines, labels_lines):
                # Skip sentence pairs that have no gold-standard score.
                if label != '' and label != '\n':
                    labels_count += 1
                    t1, t2 = text.strip().split('\t')[:2]
                    text1.append(text_clean(t1))
                    text2.append(text_clean(t2))
                    one_hot_labels.append(one_hot_encode(float(label)))
                    labels.append(float(label))
            current_len += labels_count
            indexes.append(current_len)
        except OSError:
            print('not a valid category name: ' + category)
    return text1, text2, labels, one_hot_labels, indexes
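

# ----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). The folder
# paths below are hypothetical, and the category list simply mirrors the
# "STS2016.input.<category>.txt" naming convention expected by load_eval_data.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    train_dir = "./data/train"   # hypothetical location of the training file pairs
    eval_dir = "./data/test"     # hypothetical location of the STS2016 evaluation files
    categories = ["answer-answer", "headlines", "plagiarism",
                  "postediting", "question-question"]

    s1, s2, scores, one_hot = load_data(train_dir)
    print("training pairs:", len(s1))

    e1, e2, e_scores, e_one_hot, boundaries = load_eval_data(eval_dir, categories)
    print("evaluation pairs:", len(e1))
    print("per-category index boundaries:", boundaries)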