diff --git a/parrot/bayesian_optimization.py b/parrot/bayesian_optimization.py index 1243784..951926e 100644 --- a/parrot/bayesian_optimization.py +++ b/parrot/bayesian_optimization.py @@ -8,11 +8,9 @@ Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ -import math - import numpy as np try: @@ -20,18 +18,19 @@ import GPyOpt from GPyOpt.methods import BayesianOptimization except ImportError: - print('Error importing GPy.') - print(' If trying to run parrot-optimize, make sure to use `pip install idptools-parrot[optimize]`') + print("Error importing GPy.") + print( + " If trying to run parrot-optimize, make sure to use `pip install idptools-parrot[optimize]`" + ) -from parrot import train_network -from parrot import brnn_architecture +from parrot import brnn_architecture, train_network class BayesianOptimizer(object): """A class for conducting Bayesian Optimization on a PyTorch RNN Sets up and runs GPy Bayesian Optimization in order to choose the best- - performing hyperparameters for a RNN for a given machine learning task. + performing hyperparameters for a RNN for a given machine learning task. Iteratively change learning rate, hidden vector size, and the number of layers in the network, then train and validating using 5-fold cross validation. @@ -55,7 +54,7 @@ class BayesianOptimizer(object): weights_file : str Path to which the network weights will be saved during training device : str - 'cpu' or 'cuda' depending on system hardware + 'cpu', 'mps' or 'cuda' depending on system hardware max_iterations : int Maximum number of iterations to perform the optimization procedure silent : bool @@ -64,8 +63,18 @@ class BayesianOptimizer(object): GPy-compatible bounds for each of the hyperparameters to be optimized """ - def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, - dtype, weights_file, max_iterations, device, silent): + def __init__( + self, + cv_dataloaders, + input_size, + n_epochs, + n_classes, + dtype, + weights_file, + max_iterations, + device, + silent, + ): """ Parameters ---------- @@ -83,7 +92,7 @@ def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, weights_file : str Path to which the network weights will be saved during training device : str - 'cpu' or 'cuda' depending on system hardware + 'cpu', 'mps' or 'cuda' depending on system hardware max_iterations : int Maximum number of iterations to perform the optimization procedure silent : bool @@ -96,9 +105,9 @@ def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, self.n_folds = len(cv_dataloaders) self.n_classes = n_classes if n_classes > 1: - self.problem_type = 'classification' + self.problem_type = "classification" else: - self.problem_type = 'regression' + self.problem_type = "regression" self.dtype = dtype self.weights_file = weights_file @@ -106,9 +115,19 @@ def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, self.device = device self.silent = silent - self.bds = [{'name': 'log_learning_rate', 'type': 'continuous', 'domain': (-5, -2)}, # 0.00001-0.01 - {'name': 'n_layers', 'type': 'discrete', 'domain': tuple(range(1, 6))}, # 1-5 - {'name': 'hidden_size', 'type': 'discrete', 'domain': tuple(range(5, 51))}] # 5-50 + self.bds = [ + { + "name": "log_learning_rate", + "type": "continuous", + "domain": (-5, -2), + }, # 0.00001-0.01 + { + "name": "n_layers", + "type": "discrete", + "domain": tuple(range(1, 6)), + }, # 1-5 + {"name": "hidden_size", "type": 
"discrete", "domain": tuple(range(5, 51))}, + ] # 5-50 def compute_cv_loss(self, hyperparameters): """Compute the average cross-val loss for a given set of hyperparameters @@ -125,7 +144,7 @@ def compute_cv_loss(self, hyperparameters): Returns ------- numpy float array - a Nx1 numpy array of the average cross-val loss + a Nx1 numpy array of the average cross-val loss per set of input hyperparameters """ @@ -134,7 +153,7 @@ def compute_cv_loss(self, hyperparameters): for i in range(len(hyperparameters)): log_lr, nl, hs = hyperparameters[i] - lr = 10**float(log_lr) + lr = 10 ** float(log_lr) nl = int(nl) hs = int(hs) @@ -143,7 +162,10 @@ def compute_cv_loss(self, hyperparameters): avg = np.average(cv_outputs[i]) if self.silent is False: - print(' %.6f | %2d | %2d | %.3f' % (lr, nl, hs, avg)) + print( + " %.6f | %2d | %2d | %.3f" + % (lr, nl, hs, avg) + ) outputs = np.average(cv_outputs, axis=1) return outputs @@ -166,23 +188,36 @@ def eval_cv_brnns(self, lr, nl, hs): the best validation loss from each fold of cross validation """ - cv_losses = np.zeros(self.n_folds) - 1 # -1 so that it's obvious if something goes wrong + cv_losses = ( + np.zeros(self.n_folds) - 1 + ) # -1 so that it's obvious if something goes wrong for k in range(self.n_folds): - if self.dtype == 'sequence': + if self.dtype == "sequence": # Use a many-to-one architecture - brnn_network = brnn_architecture.BRNN_MtO(self.input_size, hs, nl, - self.n_classes, self.device).to(self.device) + brnn_network = brnn_architecture.BRNN_MtO( + self.input_size, hs, nl, self.n_classes, self.device + ).to(self.device) else: # Use a many-to-many architecture - brnn_network = brnn_architecture.BRNN_MtM(self.input_size, hs, nl, - self.n_classes, self.device).to(self.device) + brnn_network = brnn_architecture.BRNN_MtM( + self.input_size, hs, nl, self.n_classes, self.device + ).to(self.device) # Train network with this set of hyperparameters - train_losses, val_losses = train_network.train(brnn_network, self.cv_loaders[k][0], - self.cv_loaders[k][1], self.dtype, self.problem_type, - self.weights_file, stop_condition='iter', device=self.device, - learn_rate=lr, n_epochs=self.n_epochs, silent=True) + train_losses, val_losses = train_network.train( + brnn_network, + self.cv_loaders[k][0], + self.cv_loaders[k][1], + self.dtype, + self.problem_type, + self.weights_file, + stop_condition="iter", + device=self.device, + learn_rate=lr, + n_epochs=self.n_epochs, + silent=True, + ) # Take best val loss best_val_loss = np.min(val_losses) cv_losses[k] = best_val_loss @@ -211,7 +246,7 @@ def initial_search(self, x): for i in range(len(x)): log_lr, nl, hs = x[i] - lr = 10**float(log_lr) + lr = 10 ** float(log_lr) nl = int(nl) hs = int(hs) @@ -237,32 +272,55 @@ def optimize(self): """ # Initial hyperparameter search -- used to get noise estimate - x_init = np.array([[-3.0, 1, 20], [-3.0, 2, 20], [-3.0, 3, 20], [-3.0, 4, 20], [-3.0, 5, 20], - [-2.0, 2, 20], [-3.3, 2, 20], [-4.0, 2, 20], [-5.0, 2, 20], - [-3.0, 2, 5], [-3.0, 2, 15], [-3.0, 2, 35], [-3.0, 2, 50]]) + x_init = np.array( + [ + [-3.0, 1, 20], + [-3.0, 2, 20], + [-3.0, 3, 20], + [-3.0, 4, 20], + [-3.0, 5, 20], + [-2.0, 2, 20], + [-3.3, 2, 20], + [-4.0, 2, 20], + [-5.0, 2, 20], + [-3.0, 2, 5], + [-3.0, 2, 15], + [-3.0, 2, 35], + [-3.0, 2, 50], + ] + ) y_init, noise = self.initial_search(x_init) if self.silent is False: print("\nInitial search results:") print("lr\tnl\ths\toutput") for i in range(len(x_init)): - print("%.5f\t%2d\t%2d\t%.4f" % (10**x_init[i][0], x_init[i][1], x_init[i][2], 
y_init[i][0])) + print( + "%.5f\t%2d\t%2d\t%.4f" + % (10 ** x_init[i][0], x_init[i][1], x_init[i][2], y_init[i][0]) + ) print("Noise estimate:", noise) - print('\n') - print('Primary optimization:') - print('--------------------\n') - print('Learning rate | n_layers | hidden vector size | avg CV loss ') - print('======================================================================') - - optimizer = BayesianOptimization(f=self.compute_cv_loss, - domain=self.bds, - model_type='GP', - acquisition_type='EI', - acquisition_jitter=0.05, - X=x_init, - Y=y_init, - noise_var=noise, - maximize=False) + print("\n") + print("Primary optimization:") + print("--------------------\n") + print( + "Learning rate | n_layers | hidden vector size | avg CV loss " + ) + print( + "======================================================================" + ) + + optimizer = BayesianOptimization( + f=self.compute_cv_loss, + domain=self.bds, + model_type="GP", + acquisition_type="EI", + acquisition_jitter=0.05, + X=x_init, + Y=y_init, + noise_var=noise, + maximize=False, + ) optimizer.run_optimization(max_iter=self.max_iterations) @@ -270,8 +328,10 @@ def optimize(self): outs = optimizer.get_evaluations()[1].flatten() if self.silent is False: - print("\nThe optimal hyperparameters are:\nlr = %.5f\nnl = %d\nhs = %d" % - (10**optimizer.x_opt[0], optimizer.x_opt[1], optimizer.x_opt[2])) + print( + "\nThe optimal hyperparameters are:\nlr = %.5f\nnl = %d\nhs = %d" + % (10 ** optimizer.x_opt[0], optimizer.x_opt[1], optimizer.x_opt[2]) + ) print() return optimizer.x_opt diff --git a/parrot/brnn_architecture.py b/parrot/brnn_architecture.py index efe09c9..7e52527 100644 --- a/parrot/brnn_architecture.py +++ b/parrot/brnn_architecture.py @@ -8,7 +8,7 @@ Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ import torch @@ -24,15 +24,15 @@ class BRNN_MtM(nn.Module): aggregates the deepest hidden layers of both directions and produces the outputs. - "Many-to-many" refers to the fact that the network will produce outputs - corresponding to every item of the input sequence. For example, an input + "Many-to-many" refers to the fact that the network will produce outputs + corresponding to every item of the input sequence. For example, an input sequence of length 10 will produce 10 sequential outputs. Attributes ---------- device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). hidden_size : int Size of hidden vectors in the network num_layers : int @@ -43,8 +43,8 @@ class BRNN_MtM(nn.Module): it should be the number of classes. lstm : PyTorch LSTM object The bidirectional LSTM layer(s) of the recurrent neural network. - fc : PyTorch Linear object - The fully connected linear layer of the recurrent neural network. Across + fc : PyTorch Linear object + The fully connected linear layer of the recurrent neural network. Across the length of the input sequence, this layer aggregates the output of the LSTM nodes from the deepest forward layer and deepest reverse layer and returns the output for that residue in the sequence. @@ -66,7 +66,7 @@ def __init__(self, input_size, hidden_size, num_layers, num_classes, device): it should be the number of classes. device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). 
""" super(BRNN_MtM, self).__init__() @@ -74,10 +74,12 @@ def __init__(self, input_size, hidden_size, num_layers, num_classes, device): self.hidden_size = hidden_size self.num_layers = num_layers self.num_classes = num_classes - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first=True, bidirectional=True) - self.fc = nn.Linear(in_features=hidden_size*2, # *2 for bidirection - out_features=num_classes) + self.lstm = nn.LSTM( + input_size, hidden_size, num_layers, batch_first=True, bidirectional=True + ) + self.fc = nn.Linear( + in_features=hidden_size * 2, out_features=num_classes # *2 for bidirection + ) def forward(self, x): """Propogate input sequences through the network to produce outputs @@ -98,10 +100,12 @@ def forward(self, x): # Set initial states # h0 and c0 dimensions: [num_layers*2 X batch_size X hidden_size] - h0 = torch.zeros(self.num_layers*2, # *2 for bidirection - x.size(0), self.hidden_size).to(self.device) - c0 = torch.zeros(self.num_layers*2, - x.size(0), self.hidden_size).to(self.device) + h0 = torch.zeros( + self.num_layers * 2, x.size(0), self.hidden_size # *2 for bidirection + ).to(self.device) + c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to( + self.device + ) # Forward propagate LSTM # out: tensor of shape: [batch_size, seq_length, hidden_size*2] @@ -121,7 +125,7 @@ class BRNN_MtO(nn.Module): aggregates the deepest hidden layers of both directions and produces the output. - "Many-to-one" refers to the fact that the network will produce a single output + "Many-to-one" refers to the fact that the network will produce a single output for an entire input sequence. For example, an input sequence of length 10 will produce only one output. @@ -129,7 +133,7 @@ class BRNN_MtO(nn.Module): ---------- device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). hidden_size : int Size of hidden vectors in the network num_layers : int @@ -140,8 +144,8 @@ class BRNN_MtO(nn.Module): it should be the number of classes. lstm : PyTorch LSTM object The bidirectional LSTM layer(s) of the recurrent neural network. - fc : PyTorch Linear object - The fully connected linear layer of the recurrent neural network. Across + fc : PyTorch Linear object + The fully connected linear layer of the recurrent neural network. Across the length of the input sequence, this layer aggregates the output of the LSTM nodes from the deepest forward layer and deepest reverse layer and returns the output for that residue in the sequence. @@ -163,17 +167,19 @@ def __init__(self, input_size, hidden_size, num_layers, num_classes, device): it should be the number of classes. device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). 
""" super(BRNN_MtO, self).__init__() self.device = device self.hidden_size = hidden_size self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first=True, bidirectional=True) - self.fc = nn.Linear(in_features=hidden_size*2, # *2 for bidirection - out_features=num_classes) + self.lstm = nn.LSTM( + input_size, hidden_size, num_layers, batch_first=True, bidirectional=True + ) + self.fc = nn.Linear( + in_features=hidden_size * 2, out_features=num_classes # *2 for bidirection + ) def forward(self, x): """Propogate input sequences through the network to produce outputs @@ -194,10 +200,12 @@ def forward(self, x): # Set initial states # h0 and c0 dimensions: [num_layers*2 X batch_size X hidden_size] - h0 = torch.zeros(self.num_layers*2, # *2 for bidirection - x.size(0), self.hidden_size).to(self.device) - c0 = torch.zeros(self.num_layers*2, - x.size(0), self.hidden_size).to(self.device) + h0 = torch.zeros( + self.num_layers * 2, x.size(0), self.hidden_size # *2 for bidirection + ).to(self.device) + c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to( + self.device + ) # Forward propagate LSTM # out: tensor of shape: [batch_size, seq_length, hidden_size*2] diff --git a/parrot/brnn_plot.py b/parrot/brnn_plot.py index 3a107d8..efbc327 100644 --- a/parrot/brnn_plot.py +++ b/parrot/brnn_plot.py @@ -1,5 +1,5 @@ """ -Plot training results for regression and classification tasks on both +Plot training results for regression and classification tasks on both sequence-mapped and residue-mapped data. ............................................................................. @@ -9,24 +9,41 @@ Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ -import numpy as np -import torch import itertools -from scipy.stats import linregress, pearsonr, spearmanr -from sklearn.metrics import roc_curve, auc -from sklearn.metrics import precision_recall_curve, average_precision_score -from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score -import matplotlib.pyplot as plt -import seaborn as sn +import os +import traceback + +import matplotlib as mpl + +mpl.use('Agg') + +import numpy as np import pandas as pd +import seaborn as sn +from scipy.stats import linregress, pearsonr, spearmanr +from sklearn.metrics import ( + accuracy_score, + auc, + average_precision_score, + f1_score, + matthews_corrcoef, + precision_recall_curve, + roc_curve, +) from parrot import encode_sequence +# Set global font size and line width for better publication clarity +mpl.rcParams["font.size"] = 12 +mpl.rcParams["lines.linewidth"] = 2 -def training_loss(train_loss, val_loss, output_file_prefix=''): +import matplotlib.pyplot as plt + + +def training_loss(train_loss, val_loss, output_file_prefix=""): """Plot training and validation loss per epoch Figure is saved to file at "_train_val_loss.png". 
@@ -41,34 +58,63 @@ def training_loss(train_loss, val_loss, output_file_prefix=''): File to which the plot will be saved as "_train_val_loss.png" """ - fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6)) - props = dict(boxstyle='round', facecolor='gainsboro', alpha=0.5) + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8)) num_epochs = len(train_loss) # Loss per epoch - training_loss, = ax.plot(np.arange(1, num_epochs+1), train_loss, label='Train') - validation_loss, = ax.plot(np.arange(1, num_epochs+1), val_loss, label='Val') - ax.set_xlabel("Epoch") - ax.set_ylabel("Avg loss") - ax.set_title("Training and testing loss per epoch") - ax.legend(handles=[training_loss, validation_loss], fontsize=14, - facecolor='gainsboro', edgecolor='slategray') + (training_loss,) = ax.plot(np.arange(1, num_epochs + 1), train_loss, label="Train") + (validation_loss,) = ax.plot(np.arange(1, num_epochs + 1), val_loss, label="Val") + ax.set_xlabel("Epoch", fontsize=10) + ax.set_ylabel("Avg loss", fontsize=10) + plt.suptitle("Training and testing loss per epoch", size=12) + plt.title(f"epochs={num_epochs}", size=11) + + # Shrink current axis's height by 10% on the bottom + box = ax.get_position() + ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9]) + + # Add legend at the bottom + ax.legend( + handles=[training_loss, validation_loss], + loc="upper center", + fontsize=8, + facecolor="gainsboro", + edgecolor="slategray", + bbox_to_anchor=(0.5, -0.15), + ncol=2, + fancybox=True, + ) if num_epochs < 21: - ax.set_xticks(np.arange(2, num_epochs+1, 2)) + ax.set_xticks(np.arange(2, num_epochs + 1, 2)) elif num_epochs < 66: - ax.set_xticks(np.arange(5, num_epochs+1, 5)) + ax.set_xticks(np.arange(5, num_epochs + 1, 5)) elif num_epochs < 151: - ax.set_xticks(np.arange(10, num_epochs+1, 10)) + ax.set_xticks(np.arange(10, num_epochs + 1, 10)) else: - ax.set_xticks(np.arange(50, num_epochs+1, 50)) - - plt.savefig(output_file_prefix + '_train_val_loss.png') + ax.set_xticks(np.arange(50, num_epochs + 1, 50)) + + plt.savefig( + output_file_prefix + "_train_val_loss.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_train_val_loss.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) plt.clf() -def sequence_regression_scatterplot(true, predicted, output_file_prefix=''): +def sequence_regression_scatterplot(true, predicted, output_file_prefix=""): """Create a scatterplot for a sequence-mapped values regression problem Figure is saved to file at "_seq_scatterplot.png". 
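
Every plotting function in this patch now saves the same figure twice, once as PNG and once as PDF, with identical dpi/transparency/bbox settings. If that repetition becomes a maintenance burden, it could be factored into a small helper along these lines (hypothetical `save_figure` helper, not part of the patch):

import matplotlib.pyplot as plt

def save_figure(output_file_prefix, name):
    """Save the current figure as both PNG and PDF, then clear it."""
    for ext in ("png", "pdf"):
        plt.savefig(
            f"{output_file_prefix}_{name}.{ext}",
            dpi=300,
            transparent=True,
            format=ext,
            bbox_inches="tight",
            pad_inches=0.1,
        )
    plt.clf()

# e.g. save_figure(output_file_prefix, "train_val_loss")
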
@@ -93,20 +139,48 @@ def sequence_regression_scatterplot(true, predicted, output_file_prefix=''): for item in predicted: pred_list.append(item.cpu().numpy()[0][0]) - plt.scatter(true_list, pred_list) - edge_vals = [0.9*min(min(true_list), min(pred_list)), - 1.1*max(max(true_list), max(pred_list))] + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8)) + ax.scatter(true_list, pred_list) + edge_vals = [ + 0.9 * min(min(true_list), min(pred_list)), + 1.1 * max(max(true_list), max(pred_list)), + ] plt.xlim(edge_vals) plt.ylim(edge_vals) - plt.plot(edge_vals, edge_vals, 'k--') - plt.xlabel('True') - plt.ylabel('Predicted') - slope, intercept, r_value, p_value, std_err = linregress(true_list, pred_list) - plt.title('Testing accuracy: R^2=%.3f' % (r_value**2)) - plt.savefig(output_file_prefix + '_seq_scatterplot.png') + plt.plot(edge_vals, edge_vals, "k--") + plt.xlabel("True", fontsize=10) + plt.ylabel("Predicted", fontsize=10) + + r_value = 0 + try: + slope, intercept, r_value, p_value, std_err = linregress(true_list, pred_list) + except: + traceback.print_exc() + + plt.suptitle( + "Testing accuracy for a sequence-mapped values regression problem", size=12 + ) + plt.title("R^2=%.3f" % (r_value**2), size=11) + plt.savefig( + output_file_prefix + "_seq_scatterplot.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_seq_scatterplot.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.clf() -def residue_regression_scatterplot(true, predicted, output_file_prefix=''): +def residue_regression_scatterplot(true, predicted, output_file_prefix=""): """Create a scatterplot for a residue-mapped values regression problem Each sequence is plotted with a unique marker-color combination, up to 70 @@ -120,7 +194,7 @@ def residue_regression_scatterplot(true, predicted, output_file_prefix=''): A list where each item is a [1 x len(sequence)] tensor with the true regression values of each residue in a sequence predicted : list of PyTorch FloatTensors - A list where each item is a [1 x len(sequence)] tensor with the + A list where each item is a [1 x len(sequence)] tensor with the regression predictions for each residue in a sequence output_file_prefix : str, optional File to which the plot will be saved as "_res_scatterplot.png" @@ -129,7 +203,7 @@ def residue_regression_scatterplot(true, predicted, output_file_prefix=''): true_list = [] pred_list = [] - marker = itertools.cycle(('>', '+', '.', 'o', '*', 'v', 'D')) + marker = itertools.cycle((">", "+", ".", "o", "*", "v", "D")) for item in true: single_frag = item.cpu().numpy()[0].flatten() @@ -138,24 +212,54 @@ def residue_regression_scatterplot(true, predicted, output_file_prefix=''): single_frag = item.cpu().numpy()[0].flatten() pred_list.append(list(single_frag)) - for i in range(len(true_list)): - plt.scatter(true_list[i], pred_list[i], s=6, marker=next(marker)) + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8)) - plt.figure(1) + for i in range(len(true_list)): + ax.scatter(true_list[i], pred_list[i], s=6, marker=next(marker)) left, right = plt.xlim() bottom, top = plt.ylim() edge_vals = [min(left, bottom), max(right, top)] plt.xlim(edge_vals) plt.ylim(edge_vals) - plt.plot(edge_vals, edge_vals, 'k--') - plt.xlabel('True') - plt.ylabel('Predicted') - slope, intercept, r_value, p_value, std_err = linregress(sum(true_list, []), sum(pred_list, [])) - plt.title('Testing accuracy: R^2=%.3f' % (r_value**2)) - 
plt.savefig(output_file_prefix + '_res_scatterplot.png') - -def plot_roc_curve(true_classes, predicted_class_probs, num_classes, output_file_prefix=''): + plt.plot(edge_vals, edge_vals, "k--") + plt.xlabel("True", fontsize=10) + plt.ylabel("Predicted", fontsize=10) + r_value = 0 + + try: + slope, intercept, r_value, p_value, std_err = linregress( + sum(true_list, []), sum(pred_list, []) + ) + except: + traceback.print_exc() + + plt.suptitle( + "Testing accuracy for a sequence-mapped values regression problem", size=12 + ) + plt.title("R^2=%.3f" % (r_value**2), size=11) + plt.savefig( + output_file_prefix + "_res_scatterplot.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_res_scatterplot.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.clf() + + +def plot_roc_curve( + true_classes, predicted_class_probs, num_classes, output_file_prefix="" +): """Create an ROC curve for a sequence classification problem Figure is saved to file at "_ROC_curve.png". @@ -188,40 +292,85 @@ def plot_roc_curve(true_classes, predicted_class_probs, num_classes, output_file fpr[c], tpr[c], _ = roc_curve(y_test[:, c], y_score[:, c]) roc_auc[c] = auc(fpr[c], tpr[c]) - plt.figure() + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8)) if num_classes > 2: # Compute micro-average ROC curve and ROC area (if multiclass) fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) # Plot all ROC curves - plt.plot(fpr["micro"], tpr["micro"], - label='Average (area = {0:0.2f})' - ''.format(roc_auc["micro"]), - color='deeppink', linestyle=':', linewidth=4) + ax.plot( + fpr["micro"], + tpr["micro"], + label="Average (area = {0:0.2f})" "".format(roc_auc["micro"]), + color="deeppink", + linestyle=":", + linewidth=4, + ) for c in range(num_classes): - plt.plot(fpr[c], tpr[c], lw=2, - label='Class {0} (area = {1:0.2f})' - ''.format(c, roc_auc[c])) - - elif num_classes == 2: # If binary classification + ax.plot( + fpr[c], + tpr[c], + lw=2, + label="Class {0} (area = {1:0.2f})" "".format(c, roc_auc[c]), + ) + + elif num_classes == 2: # If binary classification # Plot only one curve (doesn't matter which one, they are symmetric) - plt.plot(fpr[1], tpr[1], lw=2, - label='Binary class (area = {0:0.2f})' - ''.format(roc_auc[1])) - - plt.plot([0, 1], [0, 1], 'k--', lw=2) + ax.plot( + fpr[1], + tpr[1], + lw=2, + label="Binary class (area = {0:0.2f})" "".format(roc_auc[1]), + ) + + ax.plot([0, 1], [0, 1], "k--", lw=2) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) - plt.xlabel('False Positive Rate') - plt.ylabel('True Positive Rate') - plt.title('Receiver operating characteristic') - plt.legend(loc="lower right", fontsize=8) - plt.savefig(output_file_prefix + '_ROC_curve.png') - -def plot_precision_recall_curve(true_classes, predicted_class_probs, - num_classes, output_file_prefix=''): + plt.xlabel("False Positive Rate", fontsize=10) + plt.ylabel("True Positive Rate", fontsize=10) + + plt.suptitle("Receiver operating characteristic (ROC) curve", size=12) + plt.title("for a sequence-mapped values classification problem", size=11) + + # Shrink current axis's height by 10% on the bottom + box = ax.get_position() + ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9]) + + # Add legend at the bottom + ax.legend( + loc="upper center", + fontsize=8, + facecolor="gainsboro", + edgecolor="slategray", + 
bbox_to_anchor=(0.5, -0.15), + ncol=5, + fancybox=True, + ) + + plt.savefig( + output_file_prefix + "_ROC_curve.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_ROC_curve.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.clf() + + +def plot_precision_recall_curve( + true_classes, predicted_class_probs, num_classes, output_file_prefix="" +): """Create an PR curve for a sequence classification problem Figure is saved to file at "_PR_curve.png". @@ -251,36 +400,79 @@ def plot_precision_recall_curve(true_classes, predicted_class_probs, recall = dict() average_precision = dict() for i in range(num_classes): - precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], - y_score[:, i]) + precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_score[:, i]) average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i]) # A "micro-average": quantifying score on all classes jointly - precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(), - y_score.ravel()) - average_precision["micro"] = average_precision_score(y_test, y_score, - average="micro") + precision["micro"], recall["micro"], _ = precision_recall_curve( + y_test.ravel(), y_score.ravel() + ) + average_precision["micro"] = average_precision_score( + y_test, y_score, average="micro" + ) # Plot - plt.figure() - plt.plot(recall["micro"], precision["micro"], color='deeppink', linestyle=':', - linewidth=4, label='Average (area = {0:0.2f})' - ''.format(average_precision["micro"])) + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8)) + ax.plot( + recall["micro"], + precision["micro"], + color="deeppink", + linestyle=":", + linewidth=4, + label="Average (area = {0:0.2f})" "".format(average_precision["micro"]), + ) for c in range(num_classes): - plt.plot(recall[c], precision[c], lw=2, - label='Class {0} (area = {1:0.2f})' - ''.format(c, average_precision[c])) + ax.plot( + recall[c], + precision[c], + lw=2, + label="Class {0} (area = {1:0.2f})" "".format(c, average_precision[c]), + ) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) - plt.xlabel('Recall') - plt.ylabel('Precision') - plt.title('Precision-Recall') - plt.legend() - plt.savefig(output_file_prefix + '_PR_curve.png') + plt.xlabel("Recall", fontsize=10) + plt.ylabel("Precision", fontsize=10) + plt.suptitle("Precision-Recall curve", size=12) + plt.title("for a sequence-mapped values classification problem", size=11) + + # Shrink current axis's height by 10% on the bottom + box = ax.get_position() + ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9]) + + # Add legend at the bottom + ax.legend( + loc="upper center", + fontsize=8, + facecolor="gainsboro", + edgecolor="slategray", + bbox_to_anchor=(0.5, -0.15), + ncol=5, + fancybox=True, + ) + + plt.savefig( + output_file_prefix + "_PR_curve.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_PR_curve.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.clf() -def confusion_matrix(true_classes, predicted_classes, num_classes, output_file_prefix=''): +def confusion_matrix( + true_classes, predicted_classes, num_classes, output_file_prefix="" +): """Create a confusion matrix for a sequence classification problem Figure is saved to file at "_seq_CM.png". 
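
The ROC and precision-recall code above relies on the true classes being one-hot encoded so that each class column can be scored against the predicted probabilities. A toy illustration with synthetic values (not parrot data) of the per-class calls used in `plot_roc_curve` and `plot_precision_recall_curve`:

import numpy as np
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

y_test = np.array([[1, 0], [0, 1], [1, 0], [0, 1]])                   # one-hot true classes
y_score = np.array([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4], [0.2, 0.8]])  # predicted probabilities

fpr, tpr, _ = roc_curve(y_test[:, 1], y_score[:, 1])
print("class-1 ROC AUC:", auc(fpr, tpr))

precision, recall, _ = precision_recall_curve(y_test[:, 1], y_score[:, 1])
print("class-1 average precision:", average_precision_score(y_test[:, 1], y_score[:, 1]))
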
@@ -304,16 +496,40 @@ class label for a particular sequence cm[np.argmax(predicted_classes[i][0].cpu().numpy()), true_classes[i][0]] += 1 df_cm = pd.DataFrame(cm, range(num_classes), range(num_classes)) - sn.set(font_scale=1.4) # for label size - sn.heatmap(df_cm, cmap='Blues', annot=True, annot_kws={"size": 16}) # font size - plt.xlabel('True labels') - plt.ylabel('Predicted labels') - plt.title('Test set confusion matrix') + sn.set_theme(font_scale=1.4) # for label size + sn.heatmap( + df_cm, cmap="Blues", annot=True, annot_kws={"size": 8}, fmt="g" + ) # font size + + plt.xlabel("True labels", fontsize=10) + plt.ylabel("Predicted labels", fontsize=10) + plt.suptitle("Test set confusion matrix", size=12) + plt.title("for a sequence-mapped values classification problem", size=11) + plt.tight_layout() - plt.savefig(output_file_prefix + '_seq_CM.png') + + plt.savefig( + output_file_prefix + "_seq_CM.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_seq_CM.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.clf() -def res_confusion_matrix(true_classes, predicted_classes, num_classes, output_file_prefix=''): +def res_confusion_matrix( + true_classes, predicted_classes, num_classes, output_file_prefix="" +): """Create a confusion matrix for a residue classification problem Figure is saved to file at "_res_CM.png". @@ -351,17 +567,39 @@ def res_confusion_matrix(true_classes, predicted_classes, num_classes, output_fi cm[pred_list[i], true_list[i]] += 1 df_cm = pd.DataFrame(cm, range(num_classes), range(num_classes)) - sn.set(font_scale=1.4) # for label size - sn.heatmap(df_cm, cmap='Blues', annot=True, annot_kws={"size": 16}) # font size - plt.xlabel('True labels') - plt.ylabel('Predicted labels') - plt.title('Test set confusion matrix') + sn.set_theme(font_scale=1.4) # for label size + sn.heatmap( + df_cm, cmap="Blues", annot=True, annot_kws={"size": 8}, fmt="g" + ) # font size + plt.xlabel("True labels", fontsize=10) + plt.ylabel("Predicted labels", fontsize=10) + plt.suptitle("Test set confusion matrix ", size=12) + plt.title("for a residue-mapped values classification problem", size=11) + plt.tight_layout() - plt.savefig(output_file_prefix + '_res_CM.png') + + plt.savefig( + output_file_prefix + "_res_CM.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_res_CM.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.clf() -def write_performance_metrics(sequence_data, dtype, problem_type, - prob_class, output_file_prefix=''): +def write_performance_metrics( + sequence_data, dtype, problem_type, prob_class, output_file_prefix="" +): """Writes a short text file describing performance on a variety of metrics Writes different output depending on whether a classification or regression task @@ -390,13 +628,25 @@ def write_performance_metrics(sequence_data, dtype, problem_type, true_vals = [l[1] for l in sequence_data] pred_vals = [l[2] for l in sequence_data] - perform_metrics = {} - - if dtype == 'residues': + perform_metrics = { + "File": os.path.basename(output_file_prefix), + "Dataset_Type": dtype, + "Problem_Type": problem_type, + "Prob_Class": prob_class, + "Pearson_R": None, + "Spearman_R": None, + "Area_under_Precision_Recall_curve": None, + "Area_under_ROC": None, + "Matthews_Correlation_Coef": None, + "F1_Score": None, 
+ "Accuracy": None, + } + + if dtype == "residues": true_vals = np.hstack(true_vals) pred_vals = np.hstack(pred_vals) - if problem_type == 'classification': + if problem_type == "classification": # Take care of probabilistic-classification case first if prob_class: # Reformat @@ -406,39 +656,61 @@ def write_performance_metrics(sequence_data, dtype, problem_type, true_vals_array[i, true_vals[i]] = 1 # AUROC, AUPRC - perform_metrics['Area under Precision-Recall curve'] = round( - average_precision_score(true_vals_array, - pred_vals, average="micro"), 3) + perform_metrics["Area_under_Precision_Recall_curve"] = round( + average_precision_score(true_vals_array, pred_vals, average="micro"), 3 + ) fpr, tpr, _ = roc_curve(true_vals_array.ravel(), pred_vals.ravel()) - perform_metrics["Area under ROC"] = round(auc(fpr, tpr), 3) + perform_metrics["Area_under_ROC"] = round(auc(fpr, tpr), 3) # Change probs to discrete classes pred_vals = np.argmax(pred_vals, axis=1) # Then take care of general classification stats: accuracy, F1, MCC - perform_metrics['Matthews Correlation Coef'] = round( - matthews_corrcoef(true_vals, pred_vals), 3) - perform_metrics['F1 Score'] = round( - f1_score(true_vals, pred_vals, average='weighted'), 3) - perform_metrics['Accuracy'] = round(accuracy_score(true_vals, pred_vals), 3) - - - elif problem_type == 'regression': + perform_metrics["Matthews_Correlation_Coef"] = round( + matthews_corrcoef(true_vals, pred_vals), 3 + ) + perform_metrics["F1_Score"] = round( + f1_score(true_vals, pred_vals, average="weighted"), 3 + ) + perform_metrics["Accuracy"] = round(accuracy_score(true_vals, pred_vals), 3) + + elif problem_type == "regression": # Pearson R, Spearman R pears_r, p_val = pearsonr(true_vals, pred_vals) - perform_metrics['Pearson R'] = round(pears_r, 3) + perform_metrics["Pearson_R"] = round(pears_r, 3) spearman_r, p_val = spearmanr(true_vals, pred_vals) - perform_metrics['Spearman R'] = round(spearman_r, 3) + perform_metrics["Spearman_R"] = round(spearman_r, 3) # Write performance metrics to file - with open(output_file_prefix + '_performance_stats.txt', 'w') as f: + with open(output_file_prefix + "_performance_stats.txt", "w") as f: for key, value in perform_metrics.items(): - outstr = '%s : %.3f\n' % (key, value) + if isinstance(value, (int, float)): + outstr = "%s: %.3f\n" % (key, float(value)) + else: + outstr = "%s: %s\n" % ( + key, + value, + ) + print(outstr) f.write(outstr) + # Convert the dictionary to a DataFrame (Note: it's a single row dictionary) + df = pd.DataFrame([perform_metrics]) + + # Save the DataFrame to a CSV file, formatting floats with 6 decimals + df.to_csv( + output_file_prefix + "_performance_stats.csv", index=False, float_format="%.3f" + ) + -def output_predictions_to_file(sequence_data, excludeSeqID, encoding_scheme, - probabilistic_class, encoder=None, output_file_prefix=''): +def output_predictions_to_file( + sequence_data, + excludeSeqID, + encoding_scheme, + probabilistic_class, + encoder=None, + output_file_prefix="", +): """Output sequences, their true values, and their predicted values to a file Used on the output of the test_unlabeled_data() function in the train_network module in @@ -455,10 +727,10 @@ def output_predictions_to_file(sequence_data, excludeSeqID, encoding_scheme, true_value, predicted_value, sequence_ID] excludeSeqID : bool Boolean indicating whether or not each line in `tsvfile` has a sequence ID - (default is False) + (default is False) encoding_scheme : str - Description of how an amino acid sequence should be encoded 
as a numeric - vector. Providing a string other than 'onehot', 'biophysics', or 'user' + Description of how an amino acid sequence should be encoded as a numeric + vector. Providing a string other than 'onehot', 'biophysics', or 'user' will produce unintended consequences. probabilistic_class : bool Flag indicating if probabilistic classification was specified by the user. If True, @@ -484,40 +756,46 @@ def output_predictions_to_file(sequence_data, excludeSeqID, encoding_scheme, pred_vals.append(pred_val) if excludeSeqID: - names.append('test' + str(count)) + names.append("test" + str(count)) count += 1 else: names.append(name) # Decode the sequence vectors - if encoding_scheme == 'onehot': + if encoding_scheme == "onehot": sequences = encode_sequence.rev_one_hot(seq_vectors) - elif encoding_scheme == 'biophysics': + elif encoding_scheme == "biophysics": sequences = encode_sequence.rev_biophysics(seq_vectors) else: sequences = encoder.decode(seq_vectors) # Write to file - with open(output_file_prefix + '_predictions.tsv', 'w') as tsvfile: + with open(output_file_prefix + "_predictions.tsv", "w") as tsvfile: for i in range(len(names)): # Adjust formatting for residues or sequence data if isinstance(true_vals[i], np.ndarray): - true_vals_format = ' '.join(true_vals[i].astype(str)) - pred_vals_format = ' '.join(pred_vals[i].astype(str)) + true_vals_format = " ".join(true_vals[i].astype(str)) + pred_vals_format = " ".join(pred_vals[i].astype(str)) elif probabilistic_class: true_vals_format = true_vals[i] - pred_vals_format = ' '.join(np.around(pred_vals[i], decimals=4).astype(str)) + pred_vals_format = " ".join( + np.around(pred_vals[i], decimals=4).astype(str) + ) else: true_vals_format = true_vals[i] pred_vals_format = pred_vals[i] - ''' + """ Format: NAME_TRUE SEQUENCE TRUE_VALUE(S) NAME_PRED SEQUENCE PRED_VALUE(S) - ''' + """ output_str = "%s_TRUE %s %s\n" % (names[i], sequences[i], true_vals_format) - output_str = output_str + "%s_PRED %s %s\n" % (names[i], sequences[i], pred_vals_format) + output_str = output_str + "%s_PRED %s %s\n" % ( + names[i], + sequences[i], + pred_vals_format, + ) tsvfile.write(output_str) diff --git a/parrot/train_network.py b/parrot/train_network.py index d4d939a..3a95f6f 100644 --- a/parrot/train_network.py +++ b/parrot/train_network.py @@ -8,20 +8,32 @@ Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ +import time +from datetime import timedelta +import numpy as np import torch import torch.nn as nn -from torch.utils.data import Dataset, DataLoader -import numpy as np - -from parrot import brnn_plot -from parrot import encode_sequence - -def train(network, train_loader, val_loader, datatype, problem_type, weights_file, - stop_condition, device, learn_rate, n_epochs, verbose=False, silent=False): +from parrot import brnn_plot, encode_sequence + + +def train( + network, + train_loader, + val_loader, + datatype, + problem_type, + weights_file, + stop_condition, + device, + learn_rate, + n_epochs, + verbose=False, + silent=False, +): """Train a BRNN and save the best performing network weights Train the network on a training set, and every epoch evaluate its performance on @@ -30,8 +42,8 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil User must specify the machine learning tast (`problem_type`) and the format of the data (`datatype`). 
Additionally, this function requires the learning rate - hyperparameter and the number of epochs of training. The other hyperparameters, - number of hidden layers and hidden vector size, are implictly included on the + hyperparameter and the number of epochs of training. The other hyperparameters, + number of hidden layers and hidden vector size, are implictly included on the the provided network. The user may specify if they want to train the network for a set number of @@ -64,8 +76,9 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil performance has sufficiently stagnated. If the performance plateaus for `n_epochs` consecutive epochs, then training will stop. device : str - Location of where training will take place--should be either 'cpu' or - 'cuda' (GPU). If available, training on GPU is typically much faster. + Location of where training will take place--should be 'cpu', 'mps' (Apple + GPU) or 'cuda' (GPU). If available, training on GPU is typically + much faster. learn_rate : float Initial learning rate of network training. The training process is controlled by the Adam optimization algorithm, so this learning rate @@ -87,50 +100,62 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil A list of the average validation set losses achieved at each epoch """ + print(f"Using device: {device}") + network.to(device) + # Set optimizer optimizer = torch.optim.Adam(network.parameters(), lr=learn_rate) # Set loss criteria - if problem_type == 'regression': - if datatype == 'residues': - criterion = nn.MSELoss(reduction='sum') - elif datatype == 'sequence': - criterion = nn.L1Loss(reduction='sum') - elif problem_type == 'classification': - criterion = nn.CrossEntropyLoss(reduction='sum') + if problem_type == "regression": + if datatype == "residues": + criterion = nn.MSELoss(reduction="sum") + elif datatype == "sequence": + criterion = nn.L1Loss(reduction="sum") + elif problem_type == "classification": + criterion = nn.CrossEntropyLoss(reduction="sum") network = network.float() - total_step = len(train_loader) + # total_step = len(train_loader) min_val_loss = np.inf avg_train_losses = [] avg_val_losses = [] - if stop_condition == 'auto': + if stop_condition == "auto": min_epochs = n_epochs # Set to some arbitrarily large number of iterations -- will stop automatically - n_epochs = 20000000 + # n_epochs = 20000000 + n_epochs = 2 * n_epochs last_decrease = 0 # Train the model - evaluate performance on val set every epoch end_training = False + + with open("train_network.debug.log", "w") as f: + f.write("DataType;ProblemType;Epoch;MaxEpochs;Percentage;StopCondition;StartTime;EndTime;ElapsedTime;TrainLoss;ValLoss;SignificantDecrease;LastDecreaseEpoch\n") + + # To be saved only once + best_network_dict = None + for epoch in range(n_epochs): # Main loop + start_time = time.time() # Initialize training and testing loss for epoch train_loss = 0 val_loss = 0 # Iterate over batches - for i, (names, vectors, targets) in enumerate(train_loader): + for _names, vectors, targets in train_loader: vectors = vectors.to(device) targets = targets.to(device) # Forward pass outputs = network(vectors.float()) - if problem_type == 'regression': + if problem_type == "regression": loss = criterion(outputs, targets.float()) else: - if datatype == 'residues': + if datatype == "residues": outputs = outputs.permute(0, 2, 1) loss = criterion(outputs, targets.long()) @@ -141,16 +166,16 @@ def train(network, train_loader, val_loader, datatype, problem_type, 
weights_fil loss.backward() optimizer.step() - for names, vectors, targets in val_loader: + for _names, vectors, targets in val_loader: vectors = vectors.to(device) targets = targets.to(device) # Forward pass outputs = network(vectors.float()) - if problem_type == 'regression': + if problem_type == "regression": loss = criterion(outputs, targets.float()) else: - if datatype == 'residues': + if datatype == "residues": outputs = outputs.permute(0, 2, 1) loss = criterion(outputs, targets.long()) @@ -162,12 +187,12 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil val_loss /= len(val_loader.dataset) signif_decrease = True - if stop_condition == 'auto' and epoch > min_epochs - 1: + if stop_condition == "auto" and epoch > min_epochs - 1: # Check to see if loss has stopped decreasing last_epochs_loss = avg_val_losses[-min_epochs:] for loss in last_epochs_loss: - if val_loss >= loss*0.995: + if val_loss >= loss * 0.995: signif_decrease = False # If network performance has plateaued over the last range of epochs, end training @@ -176,37 +201,70 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil # Only save updated weights to memory if they improve val set performance if val_loss < min_val_loss: - min_val_loss = val_loss # Reset min_val_loss + min_val_loss = val_loss # Reset min_val_loss last_decrease = epoch - torch.save(network.state_dict(), weights_file) # Save model + best_network_dict = network.state_dict() + # torch.save(network.state_dict(), weights_file) # Save model # Append losses to lists avg_train_losses.append(train_loss) avg_val_losses.append(val_loss) + end_time = time.time() + elapsed_time = timedelta(seconds=end_time - start_time) + + # Convert elapsed time to hours, minutes, seconds, and milliseconds + hours, remainder = divmod(elapsed_time.seconds, 3600) + minutes, seconds = divmod(remainder, 60) + milliseconds = elapsed_time.microseconds // 1000 # Convert microseconds to milliseconds + + formatted_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time)) + formatted_end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time)) + formatted_elapsed_time = f"{hours}h, {minutes}m, {seconds}s, {milliseconds}ms" + percentage = 100*((epoch+1)/n_epochs) + + epoch_detail_line = f"{datatype};{problem_type};{epoch + 1};{n_epochs};{percentage:.4f}%;{stop_condition};{formatted_start_time};{formatted_end_time};{formatted_elapsed_time};{train_loss:.4f};{val_loss:.4f};{str(signif_decrease)};{last_decrease + 1}\n" + + with open("train_network.debug.log", "a") as f: + f.write(epoch_detail_line) + if verbose: - print('Epoch %d\tLoss %.4f' % (epoch, val_loss)) + print(epoch_detail_line) elif epoch % 5 == 0 and silent is False: - print('Epoch %d\tLoss %.4f' % (epoch, val_loss)) + print(epoch_detail_line) # This is placed here to ensure that the best network, even if the performance # improvement is marginal, is saved. 
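
The per-epoch timing and elapsed-time formatting added to the training loop can be exercised on its own; a standalone sketch with an artificial delay standing in for one epoch (values are illustrative):

import time
from datetime import timedelta

start_time = time.time()
time.sleep(0.25)                                   # stand-in for one training epoch
end_time = time.time()

elapsed = timedelta(seconds=end_time - start_time)
hours, remainder = divmod(elapsed.seconds, 3600)
minutes, seconds = divmod(remainder, 60)
milliseconds = elapsed.microseconds // 1000        # microseconds -> milliseconds
print(f"{hours}h, {minutes}m, {seconds}s, {milliseconds}ms")
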
if end_training: break + # Save model only once + if best_network_dict: + torch.save(best_network_dict, weights_file) + else: + raise "No data to save the model" + # Return loss per epoch so that they can be plotted return avg_train_losses, avg_val_losses -def test_labeled_data(network, test_loader, datatype, - problem_type, weights_file, num_classes, - probabilistic_classification, include_figs, - device, output_file_prefix=''): +def test_labeled_data( + network, + test_loader, + datatype, + problem_type, + weights_file, + num_classes, + probabilistic_classification, + include_figs, + device, + output_file_prefix="", +): """Test a trained BRNN on labeled sequences Using the saved weights of a trained network, run a set of sequences through the network and evaluate the performancd. Return the average loss per - sequence and plot the results. Testing a network on previously-unseen data + sequence and plot the results. Testing a network on previously-unseen data provides a useful estimate of how generalizeable the network's performance is. Parameters @@ -232,10 +290,11 @@ def test_labeled_data(network, test_loader, datatype, include_figs: bool Whether or not matplotlib figures should be generated. device : str - Location of where testing will take place--should be either 'cpu' or - 'cuda' (GPU). If available, training on GPU is typically much faster. + Location of where training will take place--should be 'cpu', 'mps' (Apple + GPU) or 'cuda' (GPU). If available, training on GPU is typically + much faster. output_file_prefix : str - Path and filename prefix to which the test set predictions and plots will be saved. + Path and filename prefix to which the test set predictions and plots will be saved. Returns ------- @@ -251,20 +310,20 @@ def test_labeled_data(network, test_loader, datatype, network.load_state_dict(torch.load(weights_file)) # Get output directory for images - network_filename = weights_file.split('/')[-1] - output_dir = weights_file[:-len(network_filename)] + network_filename = weights_file.split("/")[-1] + output_dir = weights_file[: -len(network_filename)] # Set loss criteria - if problem_type == 'regression': + if problem_type == "regression": criterion = nn.MSELoss() - elif problem_type == 'classification': + elif problem_type == "classification": criterion = nn.CrossEntropyLoss() test_loss = 0 all_targets = [] all_outputs = [] predictions = [] - for names, vectors, targets in test_loader: # batch size of 1 + for names, vectors, targets in test_loader: # batch size of 1 all_targets.append(targets) vectors = vectors.to(device) @@ -272,10 +331,10 @@ def test_labeled_data(network, test_loader, datatype, # Forward pass outputs = network(vectors.float()) - if problem_type == 'regression': + if problem_type == "regression": loss = criterion(outputs, targets.float()) else: - if datatype == 'residues': + if datatype == "residues": outputs = outputs.permute(0, 2, 1) loss = criterion(outputs, targets.long()) @@ -283,46 +342,60 @@ def test_labeled_data(network, test_loader, datatype, all_outputs.append(outputs.detach()) # Add to list as: [seq_vector, true value, predicted value, name] - predictions.append([vectors[0].cpu().numpy(), targets.cpu().numpy() - [0], outputs.cpu().detach().numpy(), names[0]]) + predictions.append( + [ + vectors[0].cpu().numpy(), + targets.cpu().numpy()[0], + outputs.cpu().detach().numpy(), + names[0], + ] + ) # Plot 'accuracy' depending on the problem type and datatype - if problem_type == 'regression': - if datatype == 'residues': + if problem_type == 
"regression": + if datatype == "residues": if include_figs: - brnn_plot.residue_regression_scatterplot(all_targets, all_outputs, - output_file_prefix=output_file_prefix) + brnn_plot.residue_regression_scatterplot( + all_targets, all_outputs, output_file_prefix=output_file_prefix + ) # Format predictions for i in range(len(predictions)): predictions[i][2] = predictions[i][2].flatten() predictions[i][1] = predictions[i][1].flatten() - elif datatype == 'sequence': + elif datatype == "sequence": if include_figs: - brnn_plot.sequence_regression_scatterplot(all_targets, all_outputs, - output_file_prefix=output_file_prefix) + brnn_plot.sequence_regression_scatterplot( + all_targets, all_outputs, output_file_prefix=output_file_prefix + ) # Format predictions for i in range(len(predictions)): predictions[i][2] = predictions[i][2][0][0] predictions[i][1] = predictions[i][1][0] - elif problem_type == 'classification': + elif problem_type == "classification": - if datatype == 'residues': + if datatype == "residues": if include_figs: - brnn_plot.res_confusion_matrix(all_targets, all_outputs, num_classes, - output_file_prefix=output_file_prefix) + brnn_plot.res_confusion_matrix( + all_targets, + all_outputs, + num_classes, + output_file_prefix=output_file_prefix, + ) # Format predictions and assign class predictions for i in range(len(predictions)): pred_values = [] + for j in range(len(predictions[i][2])): pred_values = np.argmax(predictions[i][2], axis=1)[0] - predictions[i][2] = np.array(pred_values, dtype=np.int) - elif datatype == 'sequence': + predictions[i][2] = np.array(pred_values, dtype=int) + + elif datatype == "sequence": if probabilistic_classification: # Probabilistic assignment of class predictions # Optional implementation for classification tasks @@ -337,10 +410,18 @@ def test_labeled_data(network, test_loader, datatype, # Plot ROC and PR curves if include_figs: - brnn_plot.plot_roc_curve(all_targets, pred_probabilities, num_classes, - output_file_prefix=output_file_prefix) - brnn_plot.plot_precision_recall_curve(all_targets, pred_probabilities, - num_classes, output_file_prefix=output_file_prefix) + brnn_plot.plot_roc_curve( + all_targets, + pred_probabilities, + num_classes, + output_file_prefix=output_file_prefix, + ) + brnn_plot.plot_precision_recall_curve( + all_targets, + pred_probabilities, + num_classes, + output_file_prefix=output_file_prefix, + ) else: # Absolute assignment of class predictions @@ -351,20 +432,31 @@ def test_labeled_data(network, test_loader, datatype, # Plot confusion matrix (if not in probabilistic classification mode) if include_figs: - brnn_plot.confusion_matrix(all_targets, all_outputs, num_classes, - output_file_prefix=output_file_prefix) + brnn_plot.confusion_matrix( + all_targets, + all_outputs, + num_classes, + output_file_prefix=output_file_prefix, + ) return test_loss / len(test_loader.dataset), predictions -def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', encoder=None, print_frequency=None): +def test_unlabeled_data( + network, + sequences, + device, + encoding_scheme="onehot", + encoder=None, + print_frequency=None, +): """Test a trained BRNN on unlabeled sequences Use a trained network to make predictions on previously-unseen data. - ** + ** Note: Unlike the previous functions, `network` here must have pre-loaded - weights. + weights. 
** Parameters @@ -374,8 +466,9 @@ def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', en sequences : list A list of amino acid sequences to test using the network device : str - Location of where testing will take place--should be either 'cpu' or - 'cuda' (GPU). If available, training on GPU is typically much faster. + Location of where training will take place--should be 'cpu', 'mps' (Apple + GPU) or 'cuda' (GPU). If available, training on GPU is typically + much faster. encoding_scheme : str, optional How amino acid sequences are to be encoded as numeric vectors. Currently, 'onehot','biophysics' and 'user' are the implemented options. @@ -386,7 +479,7 @@ def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', en print_frequency : int If provided defines at what sequence interval an update is printed. Default = None. - + Returns ------- dict @@ -403,13 +496,13 @@ def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', en local_count = local_count + 1 if print_frequency is not None: if local_count % print_frequency == 0: - print(f'On {local_count} of {total_count}') + print(f"On {local_count} of {total_count}") - if encoding_scheme == 'onehot': + if encoding_scheme == "onehot": seq_vector = encode_sequence.one_hot(seq) - elif encoding_scheme == 'biophysics': + elif encoding_scheme == "biophysics": seq_vector = encode_sequence.biophysics(seq) - elif encoding_scheme == 'user': + elif encoding_scheme == "user": seq_vector = encoder.encode(seq) seq_vector = seq_vector.view(1, len(seq_vector), -1) diff --git a/scripts/parrot-optimize b/scripts/parrot-optimize index 22a8365..f091951 100755 --- a/scripts/parrot-optimize +++ b/scripts/parrot-optimize @@ -1,8 +1,8 @@ #!/usr/bin/env python """ Usage: $ parrot-optimize data_file output_network - -Driver script for finding optimal hyperparameters for a bidirectional recurrent + +Driver script for finding optimal hyperparameters for a bidirectional recurrent neural network on a given dataset, then training a network with those parameters For more information on usage, use the '-h' flag. @@ -13,7 +13,7 @@ idptools-parrot was developed by the Holehouse lab Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ import os @@ -34,63 +34,148 @@ from parrot.tools import validate_args from parrot.tools import dataset_warnings # Parse the command line arguments -parser = argparse.ArgumentParser(description='Train and test a bi-directional RNN using entire sequence.') - -parser.add_argument('data_file', help='path to tsv file with format: ') - -parser.add_argument('output_network', help='location to save the trained network') - -parser.add_argument('-d', '--datatype', metavar='dtype', type=str, required=True, - help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'") - -parser.add_argument('-c', '--classes', type=int, metavar='num_classes', required=True, - help='REQUIRED. 
Number of output classes, for regression put 1') - -parser.add_argument('-b', '--batch', default=32, type=int, metavar='batch_size', - help='size of training batch (def=32)') - -parser.add_argument('-e', '--epochs', default=100, type=int, metavar='num_epochs', - help='number of training epochs (def=100)') - -parser.add_argument('--max-iter', default=50, type=int, metavar='max_iter', - help='Maximum number of iterations for the optimization procedure (def=50)') - -parser.add_argument('--split', default='', metavar='split_file', type=str, - help="file indicating how to split datafile into training, validation, and testing sets") - -parser.add_argument('--set-fractions', nargs=3, default=[0.7, 0.15, 0.15], type=float, - dest='setFractions', metavar=('train', 'val', 'test'), - help='Proportion of dataset that should be divided into training, validation, and test sets') - -parser.add_argument('--encode', default='onehot', type=str, metavar='encoding_scheme', - help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme") - -parser.add_argument('--exclude-seq-id', dest='excludeSeqID', action='store_true', - help='use if data_file lacks sequence IDs in the first column of each line') - -parser.add_argument('--probabilistic-classification', dest='probabilistic_classification', - action='store_true', help='Optional implementation for sequence classificaion') - -parser.add_argument('--include-figs', dest='include_figs', action='store_true', - help='Generate figures from training results and save to same location as network') - -parser.add_argument('--no-stats', dest='ignore_metrics', action='store_true', - help='If passed, do not output a perfomance stats file.') - -parser.add_argument('--force-cpu', dest='forceCPU', action='store_true', - help='force network to train on CPU, even if GPU is available') - -parser.add_argument('--ignore-warnings', '-w', dest='ignore_warnings', action='store_true', - help='Do not display warnings for dataset structure') - -parser.add_argument('--save-splits', dest='save_splits', action='store_true', - help='Save a split-file using the random splits from this run') - -parser.add_argument('--verbose', '-v', action='store_true', - help='Flag which, if provided, causes output to terminal to be more descriptive') - -parser.add_argument('--silent', action='store_true', - help="Flag which, if provided, ensures no output is generated to the terminal") +parser = argparse.ArgumentParser( + description="Train and test a bi-directional RNN using entire sequence." +) + +parser.add_argument( + "data_file", help="path to tsv file with format: " +) + +parser.add_argument("output_network", help="location to save the trained network") + +parser.add_argument( + "-d", + "--datatype", + metavar="dtype", + type=str, + required=True, + help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'", +) + +parser.add_argument( + "-c", + "--classes", + type=int, + metavar="num_classes", + required=True, + help="REQUIRED. 
Number of output classes, for regression put 1", +) + +parser.add_argument( + "-b", + "--batch", + default=32, + type=int, + metavar="batch_size", + help="size of training batch (def=32)", +) + +parser.add_argument( + "-e", + "--epochs", + default=100, + type=int, + metavar="num_epochs", + help="number of training epochs (def=100)", +) + +parser.add_argument( + "--max-iter", + default=50, + type=int, + metavar="max_iter", + help="Maximum number of iterations for the optimization procedure (def=50)", +) + +parser.add_argument( + "--split", + default="", + metavar="split_file", + type=str, + help="file indicating how to split datafile into training, validation, and testing sets", +) + +parser.add_argument( + "--set-fractions", + nargs=3, + default=[0.7, 0.15, 0.15], + type=float, + dest="setFractions", + metavar=("train", "val", "test"), + help="Proportion of dataset that should be divided into training, validation, and test sets", +) + +parser.add_argument( + "--encode", + default="onehot", + type=str, + metavar="encoding_scheme", + help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme", +) + +parser.add_argument( + "--exclude-seq-id", + dest="excludeSeqID", + action="store_true", + help="use if data_file lacks sequence IDs in the first column of each line", +) + +parser.add_argument( + "--probabilistic-classification", + dest="probabilistic_classification", + action="store_true", + help="Optional implementation for sequence classificaion", +) + +parser.add_argument( + "--include-figs", + dest="include_figs", + action="store_true", + help="Generate figures from training results and save to same location as network", +) + +parser.add_argument( + "--no-stats", + dest="ignore_metrics", + action="store_true", + help="If passed, do not output a perfomance stats file.", +) + +parser.add_argument( + "--force-cpu", + dest="forceCPU", + action="store_true", + help="force network to train on CPU, even if GPU is available", +) + +parser.add_argument( + "--ignore-warnings", + "-w", + dest="ignore_warnings", + action="store_true", + help="Do not display warnings for dataset structure", +) + +parser.add_argument( + "--save-splits", + dest="save_splits", + action="store_true", + help="Save a split-file using the random splits from this run", +) + +parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Flag which, if provided, causes output to terminal to be more descriptive", +) + +parser.add_argument( + "--silent", + action="store_true", + help="Flag which, if provided, ensures no output is generated to the terminal", +) args = parser.parse_args() @@ -119,29 +204,42 @@ save_splits = args.save_splits # Device configuration if forceCPU: - device = 'cpu' + device = "cpu" + device_string = "cpu" else: - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + if torch.cuda.is_available(): + device_string = "cuda" + device = torch.device(device_string) + elif torch.backends.mps.is_available() and torch.backends.mps.is_built(): + # Use MPS if available on ARM-based MacBooks + device_string = "mps" + device = torch.device(device_string) + else: + device_string = "cpu" + device = torch.device(device_string) + +if verbose: + print(f"Torch device={device_string}") ############################################################################### ################ Validate arguments and initialize: ################### # Ensure that provided data file exists -data_file = validate_args.check_file_exists(args.data_file, 'Datafile') +data_file = 
validate_args.check_file_exists(args.data_file, "Datafile") # Extract output directory and output prediction file name network_file = os.path.abspath(args.output_network) filename_prefix, output_dir = validate_args.split_file_and_directory(network_file) # If provided, check that split_file exists -if split_file != '': - split_file = validate_args.check_file_exists(split_file, 'Split-file') +if split_file != "": + split_file = validate_args.check_file_exists(split_file, "Split-file") else: split_file = None # If specified, get location where randomly generated train/val/test splits will be saved if save_splits: - save_splits_output = filename_prefix + '_split_file.txt' + save_splits_output = filename_prefix + "_split_file.txt" else: save_splits_output = None @@ -152,20 +250,22 @@ encoding_scheme, encoder, input_size = validate_args.set_encoding_scheme(encode) problem_type, collate_function = validate_args.set_ml_task(num_classes, dtype) # Ensure that network hyperparams (not being optimized) are valid -validate_args.check_positive(num_epochs, 'Number of epochs') -validate_args.check_positive(batch_size, 'Batch size') +validate_args.check_positive(num_epochs, "Number of epochs") +validate_args.check_positive(batch_size, "Batch size") # Ensure that the sum of setFractions adds up to 1 for frac in setFractions: - validate_args.check_between_zero_and_one(frac, 'Set fractions') + validate_args.check_between_zero_and_one(frac, "Set fractions") if sum(setFractions) != 1.0: - raise ValueError('Set fractions must sum to 1.') + raise ValueError("Set fractions must sum to 1.") # Ensure that task is binary sequence classification if # probabilistic_classfication is set if probabilistic_classification: - if dtype != 'sequence' or num_classes < 2: - raise ValueError('Probabilistic classification only implemented for sequence classification') + if dtype != "sequence" or num_classes < 2: + raise ValueError( + "Probabilistic classification only implemented for sequence classification" + ) # Set ignore_warnings to True if --silent is provided if silent: @@ -175,12 +275,20 @@ if silent: ################################ Main code ################################## # Split data -cvs, train, val, test = pid.split_data_cv(data_file, datatype=dtype, problem_type=problem_type, - num_classes=num_classes, excludeSeqID=excludeSeqID, - split_file=split_file, encoding_scheme=encoding_scheme, - encoder=encoder, ignoreWarnings=ignore_warnings, - percent_val=setFractions[1], percent_test=setFractions[2], - save_splits_output=save_splits_output) +cvs, train, val, test = pid.split_data_cv( + data_file, + datatype=dtype, + problem_type=problem_type, + num_classes=num_classes, + excludeSeqID=excludeSeqID, + split_file=split_file, + encoding_scheme=encoding_scheme, + encoder=encoder, + ignoreWarnings=ignore_warnings, + percent_val=setFractions[1], + percent_test=setFractions[2], + save_splits_output=save_splits_output, +) # Assess batch size compared to training set size if not ignore_warnings: @@ -189,10 +297,18 @@ if not ignore_warnings: # Convert CV datasets to dataloaders cv_loaders = [] for cv_train, cv_val in cvs: - cv_train_loader = torch.utils.data.DataLoader(dataset=cv_train, batch_size=batch_size, - collate_fn=collate_function, shuffle=True) - cv_val_loader = torch.utils.data.DataLoader(dataset=cv_val, batch_size=batch_size, - collate_fn=collate_function, shuffle=False) + cv_train_loader = torch.utils.data.DataLoader( + dataset=cv_train, + batch_size=batch_size, + collate_fn=collate_function, + shuffle=True, + ) + 
cv_val_loader = torch.utils.data.DataLoader( + dataset=cv_val, + batch_size=batch_size, + collate_fn=collate_function, + shuffle=False, + ) cv_loaders.append((cv_train_loader, cv_val_loader)) # Output to std out @@ -202,7 +318,7 @@ if silent is False: print("PARROT with hyperparameter optimization") print("---------------------------------------") if verbose: - print('Train on:\t%s' % device) + print("Train on:\t%s" % device) print("Datatype:\t%s" % dtype) print("ML Task:\t%s" % problem_type) print("Batch size:\t%d" % batch_size) @@ -210,70 +326,109 @@ if silent is False: print("Number of optimization iterations:\t%d\n" % max_iterations) # Optimization procedure -optimizer = bayesian_optimization.BayesianOptimizer(cv_loaders, input_size, num_epochs, - num_classes, dtype, network_file, - max_iterations, device, silent) +optimizer = bayesian_optimization.BayesianOptimizer( + cv_loaders, + input_size, + num_epochs, + num_classes, + dtype, + network_file, + max_iterations, + device, + silent, +) best_hyperparams = optimizer.optimize() -lr = 10**best_hyperparams[0] +lr = 10 ** best_hyperparams[0] nl = int(best_hyperparams[1]) hs = int(best_hyperparams[2]) # Save these hyperparamters to a file so that the user has a record # TODO: move to helper function -params_file = filename_prefix + '_optimal_hyperparams.txt' -with open(params_file, 'w') as f: - f.write('Learning rate: %.5f\n' % lr) - f.write('Num Layers: %d\n' % nl) - f.write('Hidden vector size: %d\n' % hs) +params_file = filename_prefix + "_optimal_hyperparams.txt" +with open(params_file, "w") as f: + f.write("Learning rate: %.5f\n" % lr) + f.write("Num Layers: %d\n" % nl) + f.write("Hidden vector size: %d\n" % hs) # Use these best hyperparams to train the network from scratch using the entire train/val sets # Add data to dataloaders -train_loader = torch.utils.data.DataLoader(dataset=train, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=True) -val_loader = torch.utils.data.DataLoader(dataset=val, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=False) -test_loader = torch.utils.data.DataLoader(dataset=test, - batch_size=1, # Set test batch size to 1 - collate_fn=collate_function, - shuffle=False) +train_loader = torch.utils.data.DataLoader( + dataset=train, batch_size=batch_size, collate_fn=collate_function, shuffle=True +) +val_loader = torch.utils.data.DataLoader( + dataset=val, batch_size=batch_size, collate_fn=collate_function, shuffle=False +) +test_loader = torch.utils.data.DataLoader( + dataset=test, + batch_size=1, # Set test batch size to 1 + collate_fn=collate_function, + shuffle=False, +) # Initialize network: -if dtype == 'sequence': - brnn_network = brnn_architecture.BRNN_MtO(input_size, hs, nl, num_classes, device).to(device) +if dtype == "sequence": + brnn_network = brnn_architecture.BRNN_MtO( + input_size, hs, nl, num_classes, device + ).to(device) else: # dtype == 'residues' - brnn_network = brnn_architecture.BRNN_MtM(input_size, hs, nl, num_classes, device).to(device) + brnn_network = brnn_architecture.BRNN_MtM( + input_size, hs, nl, num_classes, device + ).to(device) # Train network if silent is False: - print('Training with optimal hyperparams:') -train_loss, val_loss = train_network.train(brnn_network, train_loader, val_loader, datatype=dtype, - problem_type=problem_type, weights_file=network_file, - stop_condition='iter', device=device, learn_rate=lr, - n_epochs=num_epochs*2, verbose=verbose, silent=silent) + print("Training with optimal hyperparams:") +train_loss, val_loss = 
train_network.train( + brnn_network, + train_loader, + val_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + stop_condition="iter", + device=device, + learn_rate=lr, + n_epochs=num_epochs * 2, + verbose=verbose, + silent=silent, +) if include_figs: # Plot training & validation loss per epoch brnn_plot.training_loss(train_loss, val_loss, output_file_prefix=filename_prefix) # Test network -test_loss, test_set_predictions = train_network.test_labeled_data(brnn_network, test_loader, - datatype=dtype, problem_type=problem_type, - weights_file=network_file, num_classes=num_classes, - probabilistic_classification=probabilistic_classification, - include_figs=include_figs, device=device, - output_file_prefix=filename_prefix) +test_loss, test_set_predictions = train_network.test_labeled_data( + brnn_network, + test_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + num_classes=num_classes, + probabilistic_classification=probabilistic_classification, + include_figs=include_figs, + device=device, + output_file_prefix=filename_prefix, +) if silent is False: - print('\nTest Loss: %.4f' % test_loss) + print("\nTest Loss: %.4f" % test_loss) # Output performance metrics if not ignore_metrics: - brnn_plot.write_performance_metrics(test_set_predictions, dtype, problem_type, - probabilistic_classification, filename_prefix) - + brnn_plot.write_performance_metrics( + test_set_predictions, + dtype, + problem_type, + probabilistic_classification, + filename_prefix, + ) + # Output the test set predictions to a text file -brnn_plot.output_predictions_to_file(test_set_predictions, excludeSeqID, encoding_scheme, - probabilistic_classification, encoder, output_file_prefix=filename_prefix) +brnn_plot.output_predictions_to_file( + test_set_predictions, + excludeSeqID, + encoding_scheme, + probabilistic_classification, + encoder, + output_file_prefix=filename_prefix, +) diff --git a/scripts/parrot-train b/scripts/parrot-train index 39ae2b9..39dce11 100755 --- a/scripts/parrot-train +++ b/scripts/parrot-train @@ -1,7 +1,7 @@ #!/usr/bin/env python """ Usage: $ parrot-train data_file output_network - + Driver script for training a bidirectional recurrent neural network with user specified parameters. For more information on usage, use the '-h' flag. @@ -12,7 +12,7 @@ idptools-parrot was developed by the Holehouse lab Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ import os @@ -32,75 +32,182 @@ from parrot.tools import validate_args from parrot.tools import dataset_warnings # Parse the command line arguments -parser = argparse.ArgumentParser(description='Train and test a bi-directional RNN using entire sequence.') - -parser.add_argument('data_file', help='path to tsv file with format: ') - -parser.add_argument('output_network', help='location to save the trained network') - -parser.add_argument('-d', '--datatype', metavar='dtype', type=str, required=True, - help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'") - -parser.add_argument('-c', '--classes', type=int, metavar='num_classes', required=True, - help='REQUIRED. 
Number of output classes, for regression put 1') - -parser.add_argument('-hs', '--hidden-size', default=10, type=int, metavar='hidden_size', - help='hidden vector size (def=10)') - -parser.add_argument('-nl', '--num-layers', default=1, type=int, metavar='num_layers', - help='number of layers per direction (def=1)') - -parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, - metavar='learning_rate', help='(def=0.001)') - -parser.add_argument('-b', '--batch', default=32, type=int, metavar='batch_size', - help='size of training batch (def=32)') - -parser.add_argument('-e', '--epochs', default=100, type=int, metavar='num_epochs', - help='number of training epochs (def=100)') - -parser.add_argument('--split', default='', metavar='split_file', type=str, - help="file indicating how to split datafile into training, validation, and test sets") - -parser.add_argument('--stop', default='iter', metavar='stop_condition', - type=str, help="training stop condition: either 'auto' or 'iter' (default 'iter')") - -parser.add_argument('--set-fractions', nargs=3, default=[0.7, 0.15, 0.15], type=float, - dest='setFractions', metavar=('train', 'val', 'test'), - help='proportion of dataset that should be divided into training, validation, and test sets') - -parser.add_argument('--encode', default='onehot', type=str, metavar='encoding_scheme', - help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme") - -parser.add_argument('--exclude-seq-id', dest='excludeSeqID', action='store_true', - help='use if data_file lacks sequence IDs in the first column of each line') - -parser.add_argument('--probabilistic-classification', dest='probabilistic_classification', - action='store_true', help='Optional implementation for sequence classificaion') - -parser.add_argument('--include-figs', dest='include_figs', action='store_true', - help='Generate figures from training results and save to same location as network') - -parser.add_argument('--no-stats', dest='ignore_metrics', action='store_true', - help='If passed, do not output a performance stats file.') - -parser.add_argument('--force-cpu', dest='forceCPU', action='store_true', - help='force network to train on CPU, even if GPU is available') - -parser.add_argument('--gpu-id', dest='gpu_id', type=int, - help='User defined control over which CUDA device will be used by parrot') - -parser.add_argument('--ignore-warnings', '-w', dest='ignore_warnings', action='store_true', - help='Do not display warnings for dataset structure') - -parser.add_argument('--save-splits', dest='save_splits', action='store_true', - help='Save a split-file using the random splits from this run') - -parser.add_argument('--verbose', '-v', action='store_true', - help='Flag which, if provided, causes output to terminal to be more descriptive') - -parser.add_argument('--silent', action='store_true', - help="Flag which, if provided, ensures no output is generated to the terminal") +parser = argparse.ArgumentParser( + description="Train and test a bi-directional RNN using entire sequence." +) + +parser.add_argument( + "data_file", help="path to tsv file with format: " +) + +parser.add_argument("output_network", help="location to save the trained network") + +parser.add_argument( + "-d", + "--datatype", + metavar="dtype", + type=str, + required=True, + help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'", +) + +parser.add_argument( + "-c", + "--classes", + type=int, + metavar="num_classes", + required=True, + help="REQUIRED. 
Number of output classes, for regression put 1", +) + +parser.add_argument( + "-hs", + "--hidden-size", + default=10, + type=int, + metavar="hidden_size", + help="hidden vector size (def=10)", +) + +parser.add_argument( + "-nl", + "--num-layers", + default=1, + type=int, + metavar="num_layers", + help="number of layers per direction (def=1)", +) + +parser.add_argument( + "-lr", + "--learning-rate", + default=0.001, + type=float, + metavar="learning_rate", + help="(def=0.001)", +) + +parser.add_argument( + "-b", + "--batch", + default=32, + type=int, + metavar="batch_size", + help="size of training batch (def=32)", +) + +parser.add_argument( + "-e", + "--epochs", + default=100, + type=int, + metavar="num_epochs", + help="number of training epochs (def=100)", +) + +parser.add_argument( + "--split", + default="", + metavar="split_file", + type=str, + help="file indicating how to split datafile into training, validation, and test sets", +) + +parser.add_argument( + "--stop", + default="iter", + metavar="stop_condition", + type=str, + help="training stop condition: either 'auto' or 'iter' (default 'iter')", +) + +parser.add_argument( + "--set-fractions", + nargs=3, + default=[0.7, 0.15, 0.15], + type=float, + dest="setFractions", + metavar=("train", "val", "test"), + help="proportion of dataset that should be divided into training, validation, and test sets", +) + +parser.add_argument( + "--encode", + default="onehot", + type=str, + metavar="encoding_scheme", + help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme", +) + +parser.add_argument( + "--exclude-seq-id", + dest="excludeSeqID", + action="store_true", + help="use if data_file lacks sequence IDs in the first column of each line", +) + +parser.add_argument( + "--probabilistic-classification", + dest="probabilistic_classification", + action="store_true", + help="Optional implementation for sequence classificaion", +) + +parser.add_argument( + "--include-figs", + dest="include_figs", + action="store_true", + help="Generate figures from training results and save to same location as network", +) + +parser.add_argument( + "--no-stats", + dest="ignore_metrics", + action="store_true", + help="If passed, do not output a performance stats file.", +) + +parser.add_argument( + "--force-cpu", + dest="forceCPU", + action="store_true", + help="force network to train on CPU, even if GPU is available", +) + +parser.add_argument( + "--gpu-id", + dest="gpu_id", + type=int, + help="User defined control over which CUDA device will be used by parrot", +) + +parser.add_argument( + "--ignore-warnings", + "-w", + dest="ignore_warnings", + action="store_true", + help="Do not display warnings for dataset structure", +) + +parser.add_argument( + "--save-splits", + dest="save_splits", + action="store_true", + help="Save a split-file using the random splits from this run", +) + +parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Flag which, if provided, causes output to terminal to be more descriptive", +) + +parser.add_argument( + "--silent", + action="store_true", + help="Flag which, if provided, ensures no output is generated to the terminal", +) args = parser.parse_args() print(args) @@ -134,32 +241,48 @@ save_splits = args.save_splits # Device configuration if forceCPU: - device = 'cpu' + device = "cpu" + device_string = "cpu" elif gpu_id: - device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else 'cpu') - print(f"You've specified to run this network on cuda:{gpu_id}. 
Running on {device=}") + device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu") + device_string = "cuda" + print( + f"You've specified to run this network on cuda:{gpu_id}. Running on {device=}" + ) else: - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + if torch.cuda.is_available(): + device_string = "cuda" + device = torch.device(device_string) + elif torch.backends.mps.is_available() and torch.backends.mps.is_built(): + # Use MPS if available on ARM-based MacBooks + device_string = "mps" + device = torch.device(device_string) + else: + device_string = "cpu" + device = torch.device(device_string) + +if verbose: + print(f"Torch device={device_string}") ############################################################################### ############# Validate arguments and initialize network: ############## # Ensure that provided data file exists -data_file = validate_args.check_file_exists(args.data_file, 'Datafile') +data_file = validate_args.check_file_exists(args.data_file, "Datafile") # Extract output directory and output prediction file name network_file = os.path.abspath(args.output_network) filename_prefix, output_dir = validate_args.split_file_and_directory(network_file) # If provided, check that split_file exists -if split_file != '': - split_file = validate_args.check_file_exists(split_file, 'Split-file') +if split_file != "": + split_file = validate_args.check_file_exists(split_file, "Split-file") else: split_file = None # If specified, get location where randomly generated train/val/test splits will be saved if save_splits: - save_splits_output = filename_prefix + '_split_file.txt' + save_splits_output = filename_prefix + "_split_file.txt" else: save_splits_output = None @@ -170,69 +293,81 @@ encoding_scheme, encoder, input_size = validate_args.set_encoding_scheme(encode) problem_type, collate_function = validate_args.set_ml_task(num_classes, dtype) # Ensure that network hyperparams are valid -validate_args.check_between_zero_and_one(learning_rate, 'Learning rate') -validate_args.check_positive(hidden_size, 'Hidden vector size') -validate_args.check_positive(num_layers, 'Number of layers') -validate_args.check_positive(num_epochs, 'Number of epochs') -validate_args.check_positive(batch_size, 'Batch size') +validate_args.check_between_zero_and_one(learning_rate, "Learning rate") +validate_args.check_positive(hidden_size, "Hidden vector size") +validate_args.check_positive(num_layers, "Number of layers") +validate_args.check_positive(num_epochs, "Number of epochs") +validate_args.check_positive(batch_size, "Batch size") # Ensure that stop condition is 'iter' or 'auto' validate_args.check_stop_condition(stop_cond, num_epochs) # Ensure that the sum of setFractions adds up to 1 for frac in setFractions: - validate_args.check_between_zero_and_one(frac, 'Set fractions') + validate_args.check_between_zero_and_one(frac, "Set fractions") if sum(setFractions) != 1.0: - raise ValueError('Set fractions must sum to 1.') + raise ValueError("Set fractions must sum to 1.") # Ensure that task is binary sequence classification if # probabilistic_classfication is set if probabilistic_classification: - if dtype != 'sequence' or num_classes < 2: - raise ValueError('Probabilistic classification only implemented for sequence classification') + if dtype != "sequence" or num_classes < 2: + raise ValueError( + "Probabilistic classification only implemented for sequence classification" + ) # Set ignore_warnings to True if --silent is provided if silent: 
ignore_warnings = True # Initialize network architecture depending on data format -if dtype == 'sequence': +if dtype == "sequence": # Use a many-to-one architecture - brnn_network = brnn_architecture.BRNN_MtO(input_size, hidden_size, - num_layers, num_classes, device).to(device) -elif dtype == 'residues': + brnn_network = brnn_architecture.BRNN_MtO( + input_size, hidden_size, num_layers, num_classes, device + ).to(device) +elif dtype == "residues": # Use a many-to-many architecture - brnn_network = brnn_architecture.BRNN_MtM(input_size, hidden_size, - num_layers, num_classes, device).to(device) + brnn_network = brnn_architecture.BRNN_MtM( + input_size, hidden_size, num_layers, num_classes, device + ).to(device) ############################################################################### ################################ Main code ################################## # Split data -train, val, test = pid.split_data(data_file, datatype=dtype, problem_type=problem_type, - num_classes=num_classes, excludeSeqID=excludeSeqID, - split_file=split_file, encoding_scheme=encoding_scheme, - encoder=encoder, percent_val=setFractions[1], - percent_test=setFractions[2], ignoreWarnings=ignore_warnings, - save_splits_output=save_splits_output) +train, val, test = pid.split_data( + data_file, + datatype=dtype, + problem_type=problem_type, + num_classes=num_classes, + excludeSeqID=excludeSeqID, + split_file=split_file, + encoding_scheme=encoding_scheme, + encoder=encoder, + percent_val=setFractions[1], + percent_test=setFractions[2], + ignoreWarnings=ignore_warnings, + save_splits_output=save_splits_output, +) # Assess batch size compared to training set size if not ignore_warnings: dataset_warnings.eval_batch_size(batch_size, len(train)) # Add data to dataloaders -train_loader = torch.utils.data.DataLoader(dataset=train, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=True) -val_loader = torch.utils.data.DataLoader(dataset=val, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=False) -test_loader = torch.utils.data.DataLoader(dataset=test, - batch_size=1, # Set test batch size to 1 - collate_fn=collate_function, - shuffle=False) +train_loader = torch.utils.data.DataLoader( + dataset=train, batch_size=batch_size, collate_fn=collate_function, shuffle=True +) +val_loader = torch.utils.data.DataLoader( + dataset=val, batch_size=batch_size, collate_fn=collate_function, shuffle=False +) +test_loader = torch.utils.data.DataLoader( + dataset=test, + batch_size=1, # Set test batch size to 1 + collate_fn=collate_function, + shuffle=False, +) # Output to std out # TODO: move to helper function in /tools @@ -241,7 +376,7 @@ if silent is False: print("PARROT with user-specified parameters") print("-------------------------------------") if verbose > 1: - print('Train on:\t%s' % device) + print("Train on:\t%s" % device) print("Datatype:\t%s" % dtype) print("ML Task:\t%s" % problem_type) print("Learning rate:\t%f" % learning_rate) @@ -252,31 +387,57 @@ if silent is False: print("Validation set loss per epoch:") # Train network -train_loss, val_loss = train_network.train(brnn_network, train_loader, val_loader, datatype=dtype, - problem_type=problem_type, weights_file=network_file, - stop_condition=stop_cond, device=device, - learn_rate=learning_rate, n_epochs=num_epochs, - verbose=verbose, silent=silent) +train_loss, val_loss = train_network.train( + brnn_network, + train_loader, + val_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + 
stop_condition=stop_cond, + device=device, + learn_rate=learning_rate, + n_epochs=num_epochs, + verbose=verbose, + silent=silent, +) if include_figs: # Plot training & validation loss per epoch brnn_plot.training_loss(train_loss, val_loss, output_file_prefix=filename_prefix) # Test network -test_loss, test_set_predictions = train_network.test_labeled_data(brnn_network, test_loader, - datatype=dtype, problem_type=problem_type, - weights_file=network_file, num_classes=num_classes, - probabilistic_classification=probabilistic_classification, - include_figs=include_figs, device=device, - output_file_prefix=filename_prefix) +test_loss, test_set_predictions = train_network.test_labeled_data( + brnn_network, + test_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + num_classes=num_classes, + probabilistic_classification=probabilistic_classification, + include_figs=include_figs, + device=device, + output_file_prefix=filename_prefix, +) if silent is False: - print('\nTest Loss: %.4f' % test_loss) + print("\nTest Loss: %.4f" % test_loss) # Output performance metrics if not ignore_metrics: - brnn_plot.write_performance_metrics(test_set_predictions, dtype, problem_type, - probabilistic_classification, filename_prefix) + brnn_plot.write_performance_metrics( + test_set_predictions, + dtype, + problem_type, + probabilistic_classification, + filename_prefix, + ) # Output the test set predictions to a text file -brnn_plot.output_predictions_to_file(test_set_predictions, excludeSeqID, encoding_scheme, - probabilistic_classification, encoder, output_file_prefix=filename_prefix) +brnn_plot.output_predictions_to_file( + test_set_predictions, + excludeSeqID, + encoding_scheme, + probabilistic_classification, + encoder, + output_file_prefix=filename_prefix, +)
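
For reference, both parrot-optimize and parrot-train now repeat the same cuda/mps/cpu selection logic. The sketch below is a minimal, illustrative way that logic could be factored into a single helper; it assumes a PyTorch build recent enough to expose torch.backends.mps (>= 1.12), and the function name select_device is hypothetical -- it does not exist in PARROT.

import torch


def select_device(force_cpu=False, gpu_id=None):
    """Pick a torch.device, preferring CUDA, then Apple MPS, then CPU.

    Mirrors the device-selection blocks added to parrot-optimize and
    parrot-train in this diff; returns both the device and its name.
    """
    if force_cpu:
        return torch.device("cpu"), "cpu"
    if torch.cuda.is_available():
        # Honor an explicit CUDA device index when one is requested
        if gpu_id is not None:
            return torch.device(f"cuda:{gpu_id}"), "cuda"
        return torch.device("cuda"), "cuda"
    if torch.backends.mps.is_available() and torch.backends.mps.is_built():
        # MPS backend on ARM-based (Apple silicon) Macs
        return torch.device("mps"), "mps"
    return torch.device("cpu"), "cpu"


# Example usage (hypothetical; mirrors the scripts' flag names):
# device, device_string = select_device(force_cpu=args.forceCPU, gpu_id=getattr(args, "gpu_id", None))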