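"""eval.py

Evaluate a trained Keras text classifier (e.g. the GWAS or MYCO document
triage models) on a held-out set of positive and negative abstracts.
Prints and logs loss, MSE, accuracy, F1, precision, and recall, and writes
per-PMID predictions to <pos_file>_results.txt.
"""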
import os
import pickle
from datetime import datetime

from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import f1_score, precision_score, recall_score

from data_helpers import load_data_and_labels
###############
# Optional GPU selection. Alternatively, set the environment variable
# CUDA_VISIBLE_DEVICES=0 before launching the script.
# import GPUtil
# GPUtil.showUtilization()
# DEVICE_ID_LIST = GPUtil.getFirstAvailable()  # Get the first available GPU
# DEVICE_ID = DEVICE_ID_LIST[0]  # Grab the first element from the list
# os.environ["CUDA_VISIBLE_DEVICES"] = str(DEVICE_ID)  # Mask out all GPUs other than the first available one
# print('Device ID (unmasked): ' + str(DEVICE_ID))  # With the others masked, this device is now identified as GPU:0
###########################################################################
# Data input configuration: exactly one dataset block must be active.
# The GWAS block is enabled here; the MYCO block below is the alternative.
# Data input: GWAS
file_tag = "GWAS"
pos_file = "./data/GWAS/gwas_before2017_test_pos.txt"
neg_file = "./data/GWAS/gwas_before2017_test_neg.txt"
checkpoint_folder = './checkpoints/GWAS/20180516_162442/'  # Use your own checkpoint folder
checkpoint_file = os.path.join(checkpoint_folder, 'final_model.h5')
tokenizer_file = os.path.join(checkpoint_folder, 'tokenizer.pickle')

# # Data input: MYCO
# file_tag = "MYCO"
# pos_file = "./data/MYCO/mycoSet_PMIDs_POS_toText_test.txt"
# neg_file = "./data/MYCO/mycoSet_PMIDs_NEG_toText_test.txt"
# checkpoint_folder = './checkpoints/MYCO/20180521_105845/'
# checkpoint_file = os.path.join(checkpoint_folder, 'final_model.h5')
# tokenizer_file = os.path.join(checkpoint_folder, 'tokenizer.pickle')
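# Optional sanity check (a minimal addition; paths are assumed from the
# configuration above): fail early if any configured input is missing,
# rather than partway through model loading.
for _path in (pos_file, neg_file, checkpoint_file, tokenizer_file):
    if not os.path.exists(_path):
        raise IOError('Missing input file: %s' % _path)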
batch_size = 50
print('Loading data')
current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
print(current_time)
log_dir = './logs'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
log_file = os.path.join(log_dir, 'result_logs_' + file_tag + '.txt')
f_log = open(log_file, 'a')
print('Log File: %s' % os.path.join(os.getcwd(), log_file))
f_log.write("\n")
f_log.write(current_time)
f_log.write("\n")
f_log.write('pos_file: %s\n' % pos_file)
f_log.write('neg_file: %s\n' % neg_file)
f_log.write('checkpoint_folder: %s\n' % checkpoint_folder)
texts, labels = load_data_and_labels(pos_file, neg_file)
# The first whitespace-delimited token of each text is its PMID.
pmids = [text.split()[0] for text in texts]
model = load_model(checkpoint_file)
# model.input_shape is (batch_size, sequence_length); index 1 is the fixed
# input length the model was trained with.
MAX_SEQUENCE_LENGTH = model.input_shape[1]
# Load the tokenizer fitted at training time.
with open(tokenizer_file, 'rb') as handle:
    tokenizer = pickle.load(handle)
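# (The tokenizer is assumed to have been saved during training with, e.g.:
#     with open(tokenizer_file, 'wb') as handle:
#         pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
# so that the evaluation vocabulary matches the training vocabulary.)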
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
x_test = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
y_test = labels
# The model is assumed to be compiled with metrics=['mse', 'accuracy'], so
# evaluate() returns loss, MSE, and accuracy in that order.
score, mse, acc = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=2)
pred_results = model.predict(x_test)
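# pred_results is assumed to have shape (n_samples, 2): column 0 holds the
# negative-class score and column 1 the positive-class score, matching the
# NegScore/PosScore columns written to the results file below.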
print("===============")
print("loss:\t%s" % score)
print("mse:\t%s" % mse)
print("accuracy:\t%s" % acc)
# Collapse one-hot labels and softmax outputs to class indices for sklearn.
y_true = y_test.argmax(axis=1)
y_pred = pred_results.argmax(axis=1)
f1_res = f1_score(y_true, y_pred)
precision_res = precision_score(y_true, y_pred)
recall_res = recall_score(y_true, y_pred)
print("F1:\t%s" % f1_res)
print("Precision:\t%s" % precision_res)
print("Recall:\t%s" % recall_res)
print("===============")
print("Writing on files...")
f_listoutput_file = pos_file + '_results.txt'
with open(f_listoutput_file, 'w') as f_listoutput:
    f_listoutput.write("PMID\tAnswer\tPred\tNegScore\tPosScore\n")
    for x in range(len(pred_results)):
        f_listoutput.write("%s\t%d\t%d\t%.5f\t%.5f\n"
                           % (pmids[x], y_true[x], y_pred[x],
                              pred_results[x][0], pred_results[x][1]))
    f_listoutput.write("\n")
print("Evaluation results are saved at %s" % f_listoutput_file)
f_log.write("Loss: \t%s\n" % score)
f_log.write("MSE: \t%s\n" % mse)
f_log.write("Accuracy: \t%s\n" % acc)
f_log.write("F1: \t%s\n" % f1_res)
f_log.write("Precision:\t%s\n" % precision_res)
f_log.write("recall: \t%s\n" % recall_res)
f_log.write("\n")
f_log.close()
print("Logs are saved at %s" % log_dir)
print("\n")