-
Notifications
You must be signed in to change notification settings - Fork 12
/
custom.py
157 lines (116 loc) · 3.97 KB
/
custom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
from collections import Counter
from copy import deepcopy
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from keras.models import model_from_yaml
def right_pad(sequences):
    """
    Returns a deep copy of the sequences, right-padded with "*" characters so
    that every record is as long as the longest one.

    Inputs:
    =======
    - sequences: a collection of record-like objects that support `len()` and
                 expose a `.seq` attribute that can be extended with `+=` and
                 a string (presumably BioPython SeqRecord objects -- TODO
                 confirm against callers).

    Returns:
    ========
    - a deep copy of `sequences` with padded `.seq` attributes. The input is
      never modified. An empty input yields an empty copy.
    """
    padded_sequences = deepcopy(sequences)
    lengths = [len(s) for s in padded_sequences]
    if not lengths:
        # Nothing to pad; also avoids calling max() on an empty list.
        return padded_sequences

    # The target length does not change while padding, so compute it once
    # rather than on every iteration of the padding loop.
    max_length = max(lengths)
    for s, length in zip(padded_sequences, lengths):
        if length < max_length:
            # Append all missing pad characters in one step.
            s.seq += '*' * (max_length - length)
    return padded_sequences
def compute_seq_lengths(sequences):
    """
    Tallies how many sequences there are of each length.

    Returns a Counter mapping sequence length -> number of sequences that
    have that length.
    """
    return Counter(map(len, sequences))
def seq2chararray(sequences):
    """
    Lays the sequences out as a 2-D numpy character array, one row per
    sequence and one column per position. No one-hot encoding is applied.

    The sequences are right-padded first, so every row has as many columns as
    the longest input sequence.
    """
    padded = right_pad(sequences)
    n_columns = max(compute_seq_lengths(sequences).keys())
    n_rows = len(sequences)

    char_array = np.chararray(shape=(n_rows, n_columns), unicode=True)
    for row, record in enumerate(padded):
        # Iterating a record yields its individual characters.
        char_array[row, :] = list(record)
    return char_array
def compute_alphabet(sequences):
    """
    Returns the alphabet used in a set of sequences.

    Inputs:
    =======
    - sequences: an iterable of iterables of hashable symbols (e.g. strings).

    Returns:
    ========
    - a set containing every distinct symbol that occurs in any sequence;
      an empty set when there are no sequences.
    """
    alphabet = set()
    for s in sequences:
        # Grow the set in place instead of building a new set per sequence.
        alphabet.update(s)
    return alphabet
def encode_array(sequences):
    """
    Performs one-of-K (binary) encoding of a collection of sequences.

    Inputs:
    =======
    - sequences: the sequences to encode. They are padded and converted to a
                 character array internally.

    Returns:
    ========
    - a numpy array with one row per sequence and
      (longest sequence length * alphabet size) columns; each consecutive
      group of `alphabet size` columns holds the encoding of one position.

    Side effects:
    =============
    - prints the alphabet size.

    NOTE(review): the alphabet is computed from the unpadded input, so the
    "*" pad character is only a fitted class when it already occurs in the
    input -- confirm how the binarizer is expected to treat it.
    """
    alphabet = compute_alphabet(sequences)
    seq_lengths = compute_seq_lengths(sequences)
    seq_array = seq2chararray(sequences)

    # Binarize the features to one-of-K encoding.
    binarizer = LabelBinarizer()
    binarizer.fit(list(alphabet))

    n_symbols = len(alphabet)
    print(n_symbols)

    n_rows, n_positions = seq_array.shape
    encoded_array = np.zeros(
        shape=(n_rows, max(seq_lengths.keys()) * n_symbols))

    for pos in range(n_positions):
        start = pos * n_symbols
        stop = start + n_symbols
        encoded_array[:, start:stop] = binarizer.transform(seq_array[:, pos])
    return encoded_array
def embedding2binary(decoder, predictions):
    """
    Runs the decoder on the predictions and rounds the result into a binary
    array.

    Inputs:
    =======
    - decoder: a Keras model (any object exposing a `predict` method).
    - predictions: a numpy array holding the lower dimensional projection.

    Returns:
    ========
    - the decoder output with every element rounded to the nearest integer,
      i.e. a binary encoding that corresponds to a predicted sequence.
    """
    decoded = decoder.predict(predictions)
    rounded = np.rint(decoded)
    return rounded
def binary2chararray(sequences, binary_array):
    """
    Converts a binary (one-of-K) array back into a character array.

    Inputs:
    =======
    - sequences: the sequences from which the alphabet and the number of
                 positions (length of the longest sequence) are derived.
    - binary_array: a 2-D numpy array in which each consecutive group of
                    `alphabet size` columns encodes one position (presumably
                    produced by `encode_array` / `embedding2binary` on the
                    same sequences -- TODO confirm against callers).

    Returns:
    ========
    - a numpy chararray with one row per row of `binary_array` and one column
      per position of the longest sequence.
    """
    alphabet = compute_alphabet(sequences)
    n_symbols = len(alphabet)
    # Only the number of positions is needed. The padded character array has
    # exactly one column per position of the longest sequence, so there is no
    # need to build that array just to read its width.
    n_positions = max(compute_seq_lengths(sequences).keys())

    lb = LabelBinarizer()
    lb.fit(list(alphabet))

    char_array = np.chararray(shape=(len(binary_array), n_positions),
                              unicode=True)
    for i in range(n_positions):
        char_array[:, i] = lb.inverse_transform(
            binary_array[:, i * n_symbols:(i + 1) * n_symbols])
    return char_array
def save_model(model, path):
    """
    Writes a model to disk as two files sharing the `path` prefix.

    - `path + '.yaml'` receives the output of `model.to_yaml()`.
    - `path + '.h5'` is written by `model.save_weights`.
    """
    yaml_path = path + '.yaml'
    weights_path = path + '.h5'

    with open(yaml_path, 'w+') as handle:
        handle.write(model.to_yaml())
    model.save_weights(weights_path)
def load_model(path):
    """
    Rebuilds a model from the two files sharing the `path` prefix.

    Inputs:
    =======
    - path: file prefix; `path + '.yaml'` is read and passed to
            `model_from_yaml`, and `path + '.h5'` is passed to the model's
            `load_weights`.

    Returns:
    ========
    - the model returned by `model_from_yaml`, after `load_weights` has been
      called on it.
    """
    # Open read-only: 'r+' would require write permission on the file even
    # though it is only ever read here.
    with open(path + '.yaml', 'r') as f:
        yaml_rep = f.read()

    # NOTE(review): this deserializes YAML from disk; only load files from
    # trusted sources -- confirm how the installed Keras version parses it.
    model = model_from_yaml(yaml_rep)
    model.load_weights(path + '.h5')
    return model
def get_density_interval(percentage, array, axis=0):
    """
    Returns the bounds of the central (equal-tailed) percentile interval that
    covers `percentage` percent of the values in the array.

    Parameters:
    ===========
    percentage: (float, int) value between 0 and 100, inclusive.
    array: a numpy array of numbers.
    axis: axis along which the percentiles are computed (default 0).

    Returns:
    ========
    A (lower, upper) pair holding the percentiles at (100 - percentage) / 2
    and at 100 minus that value.
    """
    tail = (100 - percentage) / 2
    bounds = np.percentile(array, [tail, 100 - tail], axis=axis)
    lower, upper = bounds
    return lower, upper