-
Notifications
You must be signed in to change notification settings - Fork 1
/
learn_models.py
118 lines (103 loc) · 4.59 KB
/
learn_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import numpy as np
from networkx.algorithms import node_classification
def calculate_scores(y_true, y_pred):
    """Compute, print and return classification metrics for a set of predictions.

    ``y_true`` is cast to an integer array and ``y_pred`` to a float array
    before any metric is computed. The same ``y_pred`` values are used both as
    ranking scores (AUC, AUPR) and as hard labels (confusion matrix, F1, MCC).

    NOTE(review): unpacking the flattened confusion matrix into four counts
    only works when exactly two classes are present -- confirm that callers
    always pass binary labels.

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Sensitivity',
        'Specificity', 'F1', 'MCC', 'AUC' and 'AUPR'.
    """
    from sklearn.metrics import (
        average_precision_score,
        confusion_matrix,
        f1_score,
        matthews_corrcoef,
        roc_auc_score,
    )

    y_pred = np.array(y_pred, dtype=float)
    y_true = np.array(y_true, dtype=int)

    # Ranking metrics, computed with y_pred passed as the score argument.
    auc = roc_auc_score(y_true=y_true, y_score=y_pred)
    aupr = average_precision_score(y_true=y_true, y_score=y_pred)

    # Threshold metrics derived from the confusion-matrix counts.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    scores = {
        'Accuracy': (tp + tn) / (tp + fp + tn + fn),
        'Precision': tp / (tp + fp),
        'Sensitivity': tp / (tp + fn),
        'Specificity': tn / (tn + fp),
        'F1': f1_score(y_true, y_pred),
        'MCC': matthews_corrcoef(y_true, y_pred),
        'AUC': auc,
        'AUPR': aupr,
    }

    # The console label for sensitivity differs from its dictionary key.
    console_names = {'Sensitivity': 'Sensitivity/Recall'}
    for key, value in scores.items():
        print(console_names.get(key, key) + ':', round(value, 4))

    return scores
def learn_rf(train_features, train_labels, test_features, test_labels):
    """Train a random forest on the training split and score it on the test split.

    The forest uses 100 trees, a fixed random state of 42, verbose output and
    6 parallel jobs. Its hard predictions for ``test_features`` are handed to
    ``calculate_scores`` together with ``test_labels``.

    Returns:
        The metrics dictionary produced by ``calculate_scores``.
    """
    from sklearn.ensemble import RandomForestClassifier

    forest_settings = {
        'n_estimators': 100,
        'random_state': 42,
        'verbose': 1,
        'n_jobs': 6,
    }
    forest = RandomForestClassifier(**forest_settings)

    print("Fitting RF ...")
    forest.fit(train_features, train_labels)

    print("Predicting ...")
    predictions = forest.predict(test_features)

    return calculate_scores(y_true=test_labels, y_pred=predictions)
def learn_SVM(train_features, train_labels, test_features, test_labels):
    """Train an SVM on standardized features and score it on the test split.

    A ``StandardScaler`` is fitted on the training features only and then
    applied to the test features. The classifier is an ``SVC`` with a fixed
    random state of 42 and at most 1000 iterations. The scaler means of the
    first ten features are printed after fitting and again after transforming
    the test set.

    Returns:
        The metrics dictionary produced by ``calculate_scores``.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    scaler = StandardScaler()
    svm = SVC(random_state=42, max_iter=1000)

    print("Scaling feature matrix ...")
    scaled_train = scaler.fit_transform(train_features)
    print(f'Scaler mean for first 10 features: {scaler.mean_[:10]}')

    print("Fitting SVM ...")
    svm.fit(scaled_train, train_labels)

    print("Predicting ...")
    # Reuse the statistics learned from the training data; do not refit.
    scaled_test = scaler.transform(test_features)
    print(f'Scaler mean for first 10 features: {scaler.mean_[:10]}')
    predictions = svm.predict(scaled_test)

    return calculate_scores(y_true=test_labels, y_pred=predictions)
def semi_supervised_analysis(line_graph, shuffle_labels, rewired, method_name):
    """Evaluate a graph-based semi-supervised node classifier on a line graph.

    Every node of ``line_graph`` must carry an 'interaction' attribute (its
    label) and a 'split' attribute. Nodes whose split is 'training' get their
    label written to the 'label' node attribute of a working copy of the
    graph; nodes whose split is 'test' are used for evaluation.

    Args:
        line_graph: graph whose nodes carry 'interaction' and 'split' data.
        shuffle_labels: if truthy, permute all node labels at random before
            training and evaluation.
        rewired: if truthy, replace the graph by a random graph generated with
            ``nx.expected_degree_graph`` from the degree sequence of the
            original graph (no self-loops).
        method_name: key understood by ``_get_method``.

    Returns:
        The metrics dictionary produced by ``calculate_scores``.

    Raises:
        KeyError: if a node lacks the 'split' or 'interaction' attribute, or
            if ``method_name`` is not a known method.
    """
    import networkx as nx

    # node_ids: node -> enumeration index; node_labels: index -> interaction label.
    node_ids, node_labels = _get_ids_and_labels_of_labeled_nodes(line_graph, 'interaction')

    # Collect both splits in a single pass over the nodes.
    training_nodes = []
    test_nodes = []
    for node, data in line_graph.nodes(data=True):
        split = data['split']
        if split == 'training':
            training_nodes.append(node_ids[node])
        elif split == 'test':
            test_nodes.append(node_ids[node])

    if shuffle_labels:
        node_labels = _shuffle_labels(node_labels)

    # test_nodes is already in node-enumeration order, so a direct lookup
    # yields the labels aligned with y_pred below without scanning a list
    # for every node.
    y_true = np.array([node_labels[node_id] for node_id in test_nodes], dtype=int)

    # Work on an undirected copy so the caller's graph is not modified.
    tmp_lg = nx.Graph(line_graph)
    if rewired:
        degree_sequence = [d for _, d in tmp_lg.degree()]
        tmp_lg = nx.expected_degree_graph(degree_sequence, selfloops=False)

    # Training labels are attached by position: the i-th node of the working
    # graph receives the label of enumeration index i.
    node_list = list(tmp_lg.nodes)
    for node_id in training_nodes:
        tmp_lg.nodes[node_list[node_id]]['label'] = node_labels[node_id]

    predicted_labels = _get_method(method_name)(tmp_lg)
    y_pred = np.array(predicted_labels, dtype=int)[test_nodes]

    return calculate_scores(y_true, y_pred)
def _get_ids_and_labels_of_labeled_nodes(line_graph, predict_attribute):
    """Enumerate the nodes of ``line_graph`` and collect one attribute per node.

    Args:
        line_graph: object whose ``nodes(data=True)`` yields
            ``(node, attribute_dict)`` pairs.
        predict_attribute: key read from every node's attribute dict.

    Returns:
        A pair ``(node_ids, node_labels)`` where ``node_ids`` maps each node to
        its position in the iteration order and ``node_labels`` maps that
        position to the node's ``predict_attribute`` value.

    Raises:
        KeyError: if a node does not have ``predict_attribute``.
    """
    node_ids = {}
    node_labels = {}
    for position, (node, attributes) in enumerate(line_graph.nodes(data=True)):
        label = attributes[predict_attribute]
        node_ids[node] = position
        node_labels[position] = label
    return node_ids, node_labels
def _get_method(method_name):
    """Return the networkx node-classification function for ``method_name``.

    Supported names are 'Harmonic function' and
    'Local and global consistency'.

    Raises:
        KeyError: if ``method_name`` is not one of the supported names.
    """
    harmonic = node_classification.harmonic_function
    consistency = node_classification.local_and_global_consistency
    available_methods = {
        'Harmonic function': harmonic,
        'Local and global consistency': consistency,
    }
    return available_methods[method_name]
def _shuffle_labels(node_labels):
    """Randomly permute the values of ``node_labels`` among its keys.

    The dictionary is modified in place (keys and their order are kept) and
    the same object is returned. The permutation comes from
    ``np.random.shuffle``, so it depends on numpy's global random state.
    """
    import numpy as np

    permuted = list(node_labels.values())
    np.random.shuffle(permuted)
    # Snapshot the keys so the dict is not iterated while being written to.
    for key, new_label in zip(list(node_labels), permuted):
        node_labels[key] = new_label
    return node_labels