diff --git a/parrot/bayesian_optimization.py b/parrot/bayesian_optimization.py index 1243784..951926e 100644 --- a/parrot/bayesian_optimization.py +++ b/parrot/bayesian_optimization.py @@ -8,11 +8,9 @@ Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ -import math - import numpy as np try: @@ -20,18 +18,19 @@ import GPyOpt from GPyOpt.methods import BayesianOptimization except ImportError: - print('Error importing GPy.') - print(' If trying to run parrot-optimize, make sure to use `pip install idptools-parrot[optimize]`') + print("Error importing GPy.") + print( + " If trying to run parrot-optimize, make sure to use `pip install idptools-parrot[optimize]`" + ) -from parrot import train_network -from parrot import brnn_architecture +from parrot import brnn_architecture, train_network class BayesianOptimizer(object): """A class for conducting Bayesian Optimization on a PyTorch RNN Sets up and runs GPy Bayesian Optimization in order to choose the best- - performing hyperparameters for a RNN for a given machine learning task. + performing hyperparameters for a RNN for a given machine learning task. Iteratively change learning rate, hidden vector size, and the number of layers in the network, then train and validating using 5-fold cross validation. 
@@ -55,7 +54,7 @@ class BayesianOptimizer(object): weights_file : str Path to which the network weights will be saved during training device : str - 'cpu' or 'cuda' depending on system hardware + 'cpu', 'mps' or 'cuda' depending on system hardware max_iterations : int Maximum number of iterations to perform the optimization procedure silent : bool @@ -64,8 +63,18 @@ class BayesianOptimizer(object): GPy-compatible bounds for each of the hyperparameters to be optimized """ - def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, - dtype, weights_file, max_iterations, device, silent): + def __init__( + self, + cv_dataloaders, + input_size, + n_epochs, + n_classes, + dtype, + weights_file, + max_iterations, + device, + silent, + ): """ Parameters ---------- @@ -83,7 +92,7 @@ def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, weights_file : str Path to which the network weights will be saved during training device : str - 'cpu' or 'cuda' depending on system hardware + 'cpu', 'mps' or 'cuda' depending on system hardware max_iterations : int Maximum number of iterations to perform the optimization procedure silent : bool @@ -96,9 +105,9 @@ def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, self.n_folds = len(cv_dataloaders) self.n_classes = n_classes if n_classes > 1: - self.problem_type = 'classification' + self.problem_type = "classification" else: - self.problem_type = 'regression' + self.problem_type = "regression" self.dtype = dtype self.weights_file = weights_file @@ -106,9 +115,19 @@ def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, self.device = device self.silent = silent - self.bds = [{'name': 'log_learning_rate', 'type': 'continuous', 'domain': (-5, -2)}, # 0.00001-0.01 - {'name': 'n_layers', 'type': 'discrete', 'domain': tuple(range(1, 6))}, # 1-5 - {'name': 'hidden_size', 'type': 'discrete', 'domain': tuple(range(5, 51))}] # 5-50 + self.bds = [ + { + "name": "log_learning_rate", + "type": 
"continuous", + "domain": (-5, -2), + }, # 0.00001-0.01 + { + "name": "n_layers", + "type": "discrete", + "domain": tuple(range(1, 6)), + }, # 1-5 + {"name": "hidden_size", "type": "discrete", "domain": tuple(range(5, 51))}, + ] # 5-50 def compute_cv_loss(self, hyperparameters): """Compute the average cross-val loss for a given set of hyperparameters @@ -125,7 +144,7 @@ def compute_cv_loss(self, hyperparameters): Returns ------- numpy float array - a Nx1 numpy array of the average cross-val loss + a Nx1 numpy array of the average cross-val loss per set of input hyperparameters """ @@ -134,7 +153,7 @@ def compute_cv_loss(self, hyperparameters): for i in range(len(hyperparameters)): log_lr, nl, hs = hyperparameters[i] - lr = 10**float(log_lr) + lr = 10 ** float(log_lr) nl = int(nl) hs = int(hs) @@ -143,7 +162,10 @@ def compute_cv_loss(self, hyperparameters): avg = np.average(cv_outputs[i]) if self.silent is False: - print(' %.6f | %2d | %2d | %.3f' % (lr, nl, hs, avg)) + print( + " %.6f | %2d | %2d | %.3f" + % (lr, nl, hs, avg) + ) outputs = np.average(cv_outputs, axis=1) return outputs @@ -166,23 +188,36 @@ def eval_cv_brnns(self, lr, nl, hs): the best validation loss from each fold of cross validation """ - cv_losses = np.zeros(self.n_folds) - 1 # -1 so that it's obvious if something goes wrong + cv_losses = ( + np.zeros(self.n_folds) - 1 + ) # -1 so that it's obvious if something goes wrong for k in range(self.n_folds): - if self.dtype == 'sequence': + if self.dtype == "sequence": # Use a many-to-one architecture - brnn_network = brnn_architecture.BRNN_MtO(self.input_size, hs, nl, - self.n_classes, self.device).to(self.device) + brnn_network = brnn_architecture.BRNN_MtO( + self.input_size, hs, nl, self.n_classes, self.device + ).to(self.device) else: # Use a many-to-many architecture - brnn_network = brnn_architecture.BRNN_MtM(self.input_size, hs, nl, - self.n_classes, self.device).to(self.device) + brnn_network = brnn_architecture.BRNN_MtM( + self.input_size, hs, 
nl, self.n_classes, self.device + ).to(self.device) # Train network with this set of hyperparameters - train_losses, val_losses = train_network.train(brnn_network, self.cv_loaders[k][0], - self.cv_loaders[k][1], self.dtype, self.problem_type, - self.weights_file, stop_condition='iter', device=self.device, - learn_rate=lr, n_epochs=self.n_epochs, silent=True) + train_losses, val_losses = train_network.train( + brnn_network, + self.cv_loaders[k][0], + self.cv_loaders[k][1], + self.dtype, + self.problem_type, + self.weights_file, + stop_condition="iter", + device=self.device, + learn_rate=lr, + n_epochs=self.n_epochs, + silent=True, + ) # Take best val loss best_val_loss = np.min(val_losses) cv_losses[k] = best_val_loss @@ -211,7 +246,7 @@ def initial_search(self, x): for i in range(len(x)): log_lr, nl, hs = x[i] - lr = 10**float(log_lr) + lr = 10 ** float(log_lr) nl = int(nl) hs = int(hs) @@ -237,32 +272,55 @@ def optimize(self): """ # Initial hyperparameter search -- used to get noise estimate - x_init = np.array([[-3.0, 1, 20], [-3.0, 2, 20], [-3.0, 3, 20], [-3.0, 4, 20], [-3.0, 5, 20], - [-2.0, 2, 20], [-3.3, 2, 20], [-4.0, 2, 20], [-5.0, 2, 20], - [-3.0, 2, 5], [-3.0, 2, 15], [-3.0, 2, 35], [-3.0, 2, 50]]) + x_init = np.array( + [ + [-3.0, 1, 20], + [-3.0, 2, 20], + [-3.0, 3, 20], + [-3.0, 4, 20], + [-3.0, 5, 20], + [-2.0, 2, 20], + [-3.3, 2, 20], + [-4.0, 2, 20], + [-5.0, 2, 20], + [-3.0, 2, 5], + [-3.0, 2, 15], + [-3.0, 2, 35], + [-3.0, 2, 50], + ] + ) y_init, noise = self.initial_search(x_init) if self.silent is False: print("\nInitial search results:") print("lr\tnl\ths\toutput") for i in range(len(x_init)): - print("%.5f\t%2d\t%2d\t%.4f" % (10**x_init[i][0], x_init[i][1], x_init[i][2], y_init[i][0])) + print( + "%.5f\t%2d\t%2d\t%.4f" + % (10 ** x_init[i][0], x_init[i][1], x_init[i][2], y_init[i][0]) + ) print("Noise estimate:", noise) - print('\n') - print('Primary optimization:') - print('--------------------\n') - print('Learning rate | n_layers | hidden 
vector size | avg CV loss ') - print('======================================================================') - - optimizer = BayesianOptimization(f=self.compute_cv_loss, - domain=self.bds, - model_type='GP', - acquisition_type='EI', - acquisition_jitter=0.05, - X=x_init, - Y=y_init, - noise_var=noise, - maximize=False) + print("\n") + print("Primary optimization:") + print("--------------------\n") + print( + "Learning rate | n_layers | hidden vector size | avg CV loss " + ) + print( + "======================================================================" + ) + + optimizer = BayesianOptimization( + f=self.compute_cv_loss, + domain=self.bds, + model_type="GP", + acquisition_type="EI", + acquisition_jitter=0.05, + X=x_init, + Y=y_init, + noise_var=noise, + maximize=False, + ) optimizer.run_optimization(max_iter=self.max_iterations) @@ -270,8 +328,10 @@ def optimize(self): outs = optimizer.get_evaluations()[1].flatten() if self.silent is False: - print("\nThe optimal hyperparameters are:\nlr = %.5f\nnl = %d\nhs = %d" % - (10**optimizer.x_opt[0], optimizer.x_opt[1], optimizer.x_opt[2])) + print( + "\nThe optimal hyperparameters are:\nlr = %.5f\nnl = %d\nhs = %d" + % (10 ** optimizer.x_opt[0], optimizer.x_opt[1], optimizer.x_opt[2]) + ) print() return optimizer.x_opt diff --git a/parrot/brnn_architecture.py b/parrot/brnn_architecture.py index efe09c9..7e52527 100644 --- a/parrot/brnn_architecture.py +++ b/parrot/brnn_architecture.py @@ -8,7 +8,7 @@ Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ import torch @@ -24,15 +24,15 @@ class BRNN_MtM(nn.Module): aggregates the deepest hidden layers of both directions and produces the outputs. - "Many-to-many" refers to the fact that the network will produce outputs - corresponding to every item of the input sequence. 
For example, an input + "Many-to-many" refers to the fact that the network will produce outputs + corresponding to every item of the input sequence. For example, an input sequence of length 10 will produce 10 sequential outputs. Attributes ---------- device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). hidden_size : int Size of hidden vectors in the network num_layers : int @@ -43,8 +43,8 @@ class BRNN_MtM(nn.Module): it should be the number of classes. lstm : PyTorch LSTM object The bidirectional LSTM layer(s) of the recurrent neural network. - fc : PyTorch Linear object - The fully connected linear layer of the recurrent neural network. Across + fc : PyTorch Linear object + The fully connected linear layer of the recurrent neural network. Across the length of the input sequence, this layer aggregates the output of the LSTM nodes from the deepest forward layer and deepest reverse layer and returns the output for that residue in the sequence. @@ -66,7 +66,7 @@ def __init__(self, input_size, hidden_size, num_layers, num_classes, device): it should be the number of classes. device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). 
""" super(BRNN_MtM, self).__init__() @@ -74,10 +74,12 @@ def __init__(self, input_size, hidden_size, num_layers, num_classes, device): self.hidden_size = hidden_size self.num_layers = num_layers self.num_classes = num_classes - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first=True, bidirectional=True) - self.fc = nn.Linear(in_features=hidden_size*2, # *2 for bidirection - out_features=num_classes) + self.lstm = nn.LSTM( + input_size, hidden_size, num_layers, batch_first=True, bidirectional=True + ) + self.fc = nn.Linear( + in_features=hidden_size * 2, out_features=num_classes # *2 for bidirection + ) def forward(self, x): """Propogate input sequences through the network to produce outputs @@ -98,10 +100,12 @@ def forward(self, x): # Set initial states # h0 and c0 dimensions: [num_layers*2 X batch_size X hidden_size] - h0 = torch.zeros(self.num_layers*2, # *2 for bidirection - x.size(0), self.hidden_size).to(self.device) - c0 = torch.zeros(self.num_layers*2, - x.size(0), self.hidden_size).to(self.device) + h0 = torch.zeros( + self.num_layers * 2, x.size(0), self.hidden_size # *2 for bidirection + ).to(self.device) + c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to( + self.device + ) # Forward propagate LSTM # out: tensor of shape: [batch_size, seq_length, hidden_size*2] @@ -121,7 +125,7 @@ class BRNN_MtO(nn.Module): aggregates the deepest hidden layers of both directions and produces the output. - "Many-to-one" refers to the fact that the network will produce a single output + "Many-to-one" refers to the fact that the network will produce a single output for an entire input sequence. For example, an input sequence of length 10 will produce only one output. @@ -129,7 +133,7 @@ class BRNN_MtO(nn.Module): ---------- device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). 
hidden_size : int Size of hidden vectors in the network num_layers : int @@ -140,8 +144,8 @@ class BRNN_MtO(nn.Module): it should be the number of classes. lstm : PyTorch LSTM object The bidirectional LSTM layer(s) of the recurrent neural network. - fc : PyTorch Linear object - The fully connected linear layer of the recurrent neural network. Across + fc : PyTorch Linear object + The fully connected linear layer of the recurrent neural network. Across the length of the input sequence, this layer aggregates the output of the LSTM nodes from the deepest forward layer and deepest reverse layer and returns the output for that residue in the sequence. @@ -163,17 +167,19 @@ def __init__(self, input_size, hidden_size, num_layers, num_classes, device): it should be the number of classes. device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). """ super(BRNN_MtO, self).__init__() self.device = device self.hidden_size = hidden_size self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first=True, bidirectional=True) - self.fc = nn.Linear(in_features=hidden_size*2, # *2 for bidirection - out_features=num_classes) + self.lstm = nn.LSTM( + input_size, hidden_size, num_layers, batch_first=True, bidirectional=True + ) + self.fc = nn.Linear( + in_features=hidden_size * 2, out_features=num_classes # *2 for bidirection + ) def forward(self, x): """Propogate input sequences through the network to produce outputs @@ -194,10 +200,12 @@ def forward(self, x): # Set initial states # h0 and c0 dimensions: [num_layers*2 X batch_size X hidden_size] - h0 = torch.zeros(self.num_layers*2, # *2 for bidirection - x.size(0), self.hidden_size).to(self.device) - c0 = torch.zeros(self.num_layers*2, - x.size(0), self.hidden_size).to(self.device) + h0 = torch.zeros( + self.num_layers * 2, x.size(0), self.hidden_size # *2 for bidirection + 
).to(self.device) + c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to( + self.device + ) # Forward propagate LSTM # out: tensor of shape: [batch_size, seq_length, hidden_size*2] diff --git a/parrot/train_network.py b/parrot/train_network.py index d4d939a..eb0a7fe 100644 --- a/parrot/train_network.py +++ b/parrot/train_network.py @@ -8,20 +8,30 @@ Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ +import numpy as np import torch import torch.nn as nn -from torch.utils.data import Dataset, DataLoader -import numpy as np - -from parrot import brnn_plot -from parrot import encode_sequence - -def train(network, train_loader, val_loader, datatype, problem_type, weights_file, - stop_condition, device, learn_rate, n_epochs, verbose=False, silent=False): +from parrot import brnn_plot, encode_sequence + + +def train( + network, + train_loader, + val_loader, + datatype, + problem_type, + weights_file, + stop_condition, + device, + learn_rate, + n_epochs, + verbose=False, + silent=False, +): """Train a BRNN and save the best performing network weights Train the network on a training set, and every epoch evaluate its performance on @@ -30,8 +40,8 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil User must specify the machine learning tast (`problem_type`) and the format of the data (`datatype`). Additionally, this function requires the learning rate - hyperparameter and the number of epochs of training. The other hyperparameters, - number of hidden layers and hidden vector size, are implictly included on the + hyperparameter and the number of epochs of training. The other hyperparameters, + number of hidden layers and hidden vector size, are implictly included on the the provided network. 
The user may specify if they want to train the network for a set number of @@ -64,8 +74,9 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil performance has sufficiently stagnated. If the performance plateaus for `n_epochs` consecutive epochs, then training will stop. device : str - Location of where training will take place--should be either 'cpu' or - 'cuda' (GPU). If available, training on GPU is typically much faster. + Location of where training will take place--should be 'cpu', 'mps' (Apple + GPU) or 'cuda' (GPU). If available, training on GPU is typically + much faster. learn_rate : float Initial learning rate of network training. The training process is controlled by the Adam optimization algorithm, so this learning rate @@ -91,13 +102,13 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil optimizer = torch.optim.Adam(network.parameters(), lr=learn_rate) # Set loss criteria - if problem_type == 'regression': - if datatype == 'residues': - criterion = nn.MSELoss(reduction='sum') - elif datatype == 'sequence': - criterion = nn.L1Loss(reduction='sum') - elif problem_type == 'classification': - criterion = nn.CrossEntropyLoss(reduction='sum') + if problem_type == "regression": + if datatype == "residues": + criterion = nn.MSELoss(reduction="sum") + elif datatype == "sequence": + criterion = nn.L1Loss(reduction="sum") + elif problem_type == "classification": + criterion = nn.CrossEntropyLoss(reduction="sum") network = network.float() total_step = len(train_loader) @@ -105,7 +116,7 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil avg_train_losses = [] avg_val_losses = [] - if stop_condition == 'auto': + if stop_condition == "auto": min_epochs = n_epochs # Set to some arbitrarily large number of iterations -- will stop automatically n_epochs = 20000000 @@ -127,10 +138,10 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil # Forward 
pass outputs = network(vectors.float()) - if problem_type == 'regression': + if problem_type == "regression": loss = criterion(outputs, targets.float()) else: - if datatype == 'residues': + if datatype == "residues": outputs = outputs.permute(0, 2, 1) loss = criterion(outputs, targets.long()) @@ -147,10 +158,10 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil # Forward pass outputs = network(vectors.float()) - if problem_type == 'regression': + if problem_type == "regression": loss = criterion(outputs, targets.float()) else: - if datatype == 'residues': + if datatype == "residues": outputs = outputs.permute(0, 2, 1) loss = criterion(outputs, targets.long()) @@ -162,12 +173,12 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil val_loss /= len(val_loader.dataset) signif_decrease = True - if stop_condition == 'auto' and epoch > min_epochs - 1: + if stop_condition == "auto" and epoch > min_epochs - 1: # Check to see if loss has stopped decreasing last_epochs_loss = avg_val_losses[-min_epochs:] for loss in last_epochs_loss: - if val_loss >= loss*0.995: + if val_loss >= loss * 0.995: signif_decrease = False # If network performance has plateaued over the last range of epochs, end training @@ -176,7 +187,7 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil # Only save updated weights to memory if they improve val set performance if val_loss < min_val_loss: - min_val_loss = val_loss # Reset min_val_loss + min_val_loss = val_loss # Reset min_val_loss last_decrease = epoch torch.save(network.state_dict(), weights_file) # Save model @@ -185,9 +196,9 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil avg_val_losses.append(val_loss) if verbose: - print('Epoch %d\tLoss %.4f' % (epoch, val_loss)) + print("Epoch %d\tLoss %.4f" % (epoch, val_loss)) elif epoch % 5 == 0 and silent is False: - print('Epoch %d\tLoss %.4f' % (epoch, val_loss)) + 
print("Epoch %d\tLoss %.4f" % (epoch, val_loss)) # This is placed here to ensure that the best network, even if the performance # improvement is marginal, is saved. @@ -198,15 +209,23 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil return avg_train_losses, avg_val_losses -def test_labeled_data(network, test_loader, datatype, - problem_type, weights_file, num_classes, - probabilistic_classification, include_figs, - device, output_file_prefix=''): +def test_labeled_data( + network, + test_loader, + datatype, + problem_type, + weights_file, + num_classes, + probabilistic_classification, + include_figs, + device, + output_file_prefix="", +): """Test a trained BRNN on labeled sequences Using the saved weights of a trained network, run a set of sequences through the network and evaluate the performancd. Return the average loss per - sequence and plot the results. Testing a network on previously-unseen data + sequence and plot the results. Testing a network on previously-unseen data provides a useful estimate of how generalizeable the network's performance is. Parameters @@ -232,10 +251,11 @@ def test_labeled_data(network, test_loader, datatype, include_figs: bool Whether or not matplotlib figures should be generated. device : str - Location of where testing will take place--should be either 'cpu' or - 'cuda' (GPU). If available, training on GPU is typically much faster. + Location of where testing will take place--should be 'cpu', 'mps' (Apple + GPU) or 'cuda' (GPU). If available, training on GPU is typically + much faster. output_file_prefix : str - Path and filename prefix to which the test set predictions and plots will be saved. + Path and filename prefix to which the test set predictions and plots will be saved. 
Returns ------- @@ -251,20 +271,20 @@ def test_labeled_data(network, test_loader, datatype, network.load_state_dict(torch.load(weights_file)) # Get output directory for images - network_filename = weights_file.split('/')[-1] - output_dir = weights_file[:-len(network_filename)] + network_filename = weights_file.split("/")[-1] + output_dir = weights_file[: -len(network_filename)] # Set loss criteria - if problem_type == 'regression': + if problem_type == "regression": criterion = nn.MSELoss() - elif problem_type == 'classification': + elif problem_type == "classification": criterion = nn.CrossEntropyLoss() test_loss = 0 all_targets = [] all_outputs = [] predictions = [] - for names, vectors, targets in test_loader: # batch size of 1 + for names, vectors, targets in test_loader: # batch size of 1 all_targets.append(targets) vectors = vectors.to(device) @@ -272,10 +292,10 @@ def test_labeled_data(network, test_loader, datatype, # Forward pass outputs = network(vectors.float()) - if problem_type == 'regression': + if problem_type == "regression": loss = criterion(outputs, targets.float()) else: - if datatype == 'residues': + if datatype == "residues": outputs = outputs.permute(0, 2, 1) loss = criterion(outputs, targets.long()) @@ -283,37 +303,49 @@ def test_labeled_data(network, test_loader, datatype, all_outputs.append(outputs.detach()) # Add to list as: [seq_vector, true value, predicted value, name] - predictions.append([vectors[0].cpu().numpy(), targets.cpu().numpy() - [0], outputs.cpu().detach().numpy(), names[0]]) + predictions.append( + [ + vectors[0].cpu().numpy(), + targets.cpu().numpy()[0], + outputs.cpu().detach().numpy(), + names[0], + ] + ) # Plot 'accuracy' depending on the problem type and datatype - if problem_type == 'regression': - if datatype == 'residues': + if problem_type == "regression": + if datatype == "residues": if include_figs: - brnn_plot.residue_regression_scatterplot(all_targets, all_outputs, - output_file_prefix=output_file_prefix) + 
brnn_plot.residue_regression_scatterplot( + all_targets, all_outputs, output_file_prefix=output_file_prefix + ) # Format predictions for i in range(len(predictions)): predictions[i][2] = predictions[i][2].flatten() predictions[i][1] = predictions[i][1].flatten() - elif datatype == 'sequence': + elif datatype == "sequence": if include_figs: - brnn_plot.sequence_regression_scatterplot(all_targets, all_outputs, - output_file_prefix=output_file_prefix) + brnn_plot.sequence_regression_scatterplot( + all_targets, all_outputs, output_file_prefix=output_file_prefix + ) # Format predictions for i in range(len(predictions)): predictions[i][2] = predictions[i][2][0][0] predictions[i][1] = predictions[i][1][0] - elif problem_type == 'classification': + elif problem_type == "classification": - if datatype == 'residues': + if datatype == "residues": if include_figs: - brnn_plot.res_confusion_matrix(all_targets, all_outputs, num_classes, - output_file_prefix=output_file_prefix) + brnn_plot.res_confusion_matrix( + all_targets, + all_outputs, + num_classes, + output_file_prefix=output_file_prefix, + ) # Format predictions and assign class predictions for i in range(len(predictions)): @@ -322,7 +354,7 @@ def test_labeled_data(network, test_loader, datatype, pred_values = np.argmax(predictions[i][2], axis=1)[0] predictions[i][2] = np.array(pred_values, dtype=np.int) - elif datatype == 'sequence': + elif datatype == "sequence": if probabilistic_classification: # Probabilistic assignment of class predictions # Optional implementation for classification tasks @@ -337,10 +369,18 @@ def test_labeled_data(network, test_loader, datatype, # Plot ROC and PR curves if include_figs: - brnn_plot.plot_roc_curve(all_targets, pred_probabilities, num_classes, - output_file_prefix=output_file_prefix) - brnn_plot.plot_precision_recall_curve(all_targets, pred_probabilities, - num_classes, output_file_prefix=output_file_prefix) + brnn_plot.plot_roc_curve( + all_targets, + pred_probabilities, + 
num_classes, + output_file_prefix=output_file_prefix, + ) + brnn_plot.plot_precision_recall_curve( + all_targets, + pred_probabilities, + num_classes, + output_file_prefix=output_file_prefix, + ) else: # Absolute assignment of class predictions @@ -351,20 +391,31 @@ def test_labeled_data(network, test_loader, datatype, # Plot confusion matrix (if not in probabilistic classification mode) if include_figs: - brnn_plot.confusion_matrix(all_targets, all_outputs, num_classes, - output_file_prefix=output_file_prefix) + brnn_plot.confusion_matrix( + all_targets, + all_outputs, + num_classes, + output_file_prefix=output_file_prefix, + ) return test_loss / len(test_loader.dataset), predictions -def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', encoder=None, print_frequency=None): +def test_unlabeled_data( + network, + sequences, + device, + encoding_scheme="onehot", + encoder=None, + print_frequency=None, +): """Test a trained BRNN on unlabeled sequences Use a trained network to make predictions on previously-unseen data. - ** + ** Note: Unlike the previous functions, `network` here must have pre-loaded - weights. + weights. ** Parameters @@ -374,8 +425,9 @@ def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', en sequences : list A list of amino acid sequences to test using the network device : str - Location of where testing will take place--should be either 'cpu' or - 'cuda' (GPU). If available, training on GPU is typically much faster. + Location of where testing will take place--should be 'cpu', 'mps' (Apple + GPU) or 'cuda' (GPU). If available, training on GPU is typically + much faster. encoding_scheme : str, optional How amino acid sequences are to be encoded as numeric vectors. Currently, 'onehot','biophysics' and 'user' are the implemented options. 
@@ -386,7 +438,7 @@ def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', en print_frequency : int If provided defines at what sequence interval an update is printed. Default = None. - + Returns ------- dict @@ -403,13 +455,13 @@ def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', en local_count = local_count + 1 if print_frequency is not None: if local_count % print_frequency == 0: - print(f'On {local_count} of {total_count}') + print(f"On {local_count} of {total_count}") - if encoding_scheme == 'onehot': + if encoding_scheme == "onehot": seq_vector = encode_sequence.one_hot(seq) - elif encoding_scheme == 'biophysics': + elif encoding_scheme == "biophysics": seq_vector = encode_sequence.biophysics(seq) - elif encoding_scheme == 'user': + elif encoding_scheme == "user": seq_vector = encoder.encode(seq) seq_vector = seq_vector.view(1, len(seq_vector), -1) diff --git a/scripts/parrot-optimize b/scripts/parrot-optimize index 22a8365..f091951 100755 --- a/scripts/parrot-optimize +++ b/scripts/parrot-optimize @@ -1,8 +1,8 @@ #!/usr/bin/env python """ Usage: $ parrot-optimize data_file output_network - -Driver script for finding optimal hyperparameters for a bidirectional recurrent + +Driver script for finding optimal hyperparameters for a bidirectional recurrent neural network on a given dataset, then training a network with those parameters For more information on usage, use the '-h' flag. @@ -13,7 +13,7 @@ idptools-parrot was developed by the Holehouse lab Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. 
""" import os @@ -34,63 +34,148 @@ from parrot.tools import validate_args from parrot.tools import dataset_warnings # Parse the command line arguments -parser = argparse.ArgumentParser(description='Train and test a bi-directional RNN using entire sequence.') - -parser.add_argument('data_file', help='path to tsv file with format: ') - -parser.add_argument('output_network', help='location to save the trained network') - -parser.add_argument('-d', '--datatype', metavar='dtype', type=str, required=True, - help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'") - -parser.add_argument('-c', '--classes', type=int, metavar='num_classes', required=True, - help='REQUIRED. Number of output classes, for regression put 1') - -parser.add_argument('-b', '--batch', default=32, type=int, metavar='batch_size', - help='size of training batch (def=32)') - -parser.add_argument('-e', '--epochs', default=100, type=int, metavar='num_epochs', - help='number of training epochs (def=100)') - -parser.add_argument('--max-iter', default=50, type=int, metavar='max_iter', - help='Maximum number of iterations for the optimization procedure (def=50)') - -parser.add_argument('--split', default='', metavar='split_file', type=str, - help="file indicating how to split datafile into training, validation, and testing sets") - -parser.add_argument('--set-fractions', nargs=3, default=[0.7, 0.15, 0.15], type=float, - dest='setFractions', metavar=('train', 'val', 'test'), - help='Proportion of dataset that should be divided into training, validation, and test sets') - -parser.add_argument('--encode', default='onehot', type=str, metavar='encoding_scheme', - help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme") - -parser.add_argument('--exclude-seq-id', dest='excludeSeqID', action='store_true', - help='use if data_file lacks sequence IDs in the first column of each line') - -parser.add_argument('--probabilistic-classification', 
dest='probabilistic_classification', - action='store_true', help='Optional implementation for sequence classificaion') - -parser.add_argument('--include-figs', dest='include_figs', action='store_true', - help='Generate figures from training results and save to same location as network') - -parser.add_argument('--no-stats', dest='ignore_metrics', action='store_true', - help='If passed, do not output a perfomance stats file.') - -parser.add_argument('--force-cpu', dest='forceCPU', action='store_true', - help='force network to train on CPU, even if GPU is available') - -parser.add_argument('--ignore-warnings', '-w', dest='ignore_warnings', action='store_true', - help='Do not display warnings for dataset structure') - -parser.add_argument('--save-splits', dest='save_splits', action='store_true', - help='Save a split-file using the random splits from this run') - -parser.add_argument('--verbose', '-v', action='store_true', - help='Flag which, if provided, causes output to terminal to be more descriptive') - -parser.add_argument('--silent', action='store_true', - help="Flag which, if provided, ensures no output is generated to the terminal") +parser = argparse.ArgumentParser( + description="Train and test a bi-directional RNN using entire sequence." +) + +parser.add_argument( + "data_file", help="path to tsv file with format: " +) + +parser.add_argument("output_network", help="location to save the trained network") + +parser.add_argument( + "-d", + "--datatype", + metavar="dtype", + type=str, + required=True, + help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'", +) + +parser.add_argument( + "-c", + "--classes", + type=int, + metavar="num_classes", + required=True, + help="REQUIRED. 
Number of output classes, for regression put 1", +) + +parser.add_argument( + "-b", + "--batch", + default=32, + type=int, + metavar="batch_size", + help="size of training batch (def=32)", +) + +parser.add_argument( + "-e", + "--epochs", + default=100, + type=int, + metavar="num_epochs", + help="number of training epochs (def=100)", +) + +parser.add_argument( + "--max-iter", + default=50, + type=int, + metavar="max_iter", + help="Maximum number of iterations for the optimization procedure (def=50)", +) + +parser.add_argument( + "--split", + default="", + metavar="split_file", + type=str, + help="file indicating how to split datafile into training, validation, and testing sets", +) + +parser.add_argument( + "--set-fractions", + nargs=3, + default=[0.7, 0.15, 0.15], + type=float, + dest="setFractions", + metavar=("train", "val", "test"), + help="Proportion of dataset that should be divided into training, validation, and test sets", +) + +parser.add_argument( + "--encode", + default="onehot", + type=str, + metavar="encoding_scheme", + help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme", +) + +parser.add_argument( + "--exclude-seq-id", + dest="excludeSeqID", + action="store_true", + help="use if data_file lacks sequence IDs in the first column of each line", +) + +parser.add_argument( + "--probabilistic-classification", + dest="probabilistic_classification", + action="store_true", + help="Optional implementation for sequence classificaion", +) + +parser.add_argument( + "--include-figs", + dest="include_figs", + action="store_true", + help="Generate figures from training results and save to same location as network", +) + +parser.add_argument( + "--no-stats", + dest="ignore_metrics", + action="store_true", + help="If passed, do not output a perfomance stats file.", +) + +parser.add_argument( + "--force-cpu", + dest="forceCPU", + action="store_true", + help="force network to train on CPU, even if GPU is available", +) + 
+parser.add_argument( + "--ignore-warnings", + "-w", + dest="ignore_warnings", + action="store_true", + help="Do not display warnings for dataset structure", +) + +parser.add_argument( + "--save-splits", + dest="save_splits", + action="store_true", + help="Save a split-file using the random splits from this run", +) + +parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Flag which, if provided, causes output to terminal to be more descriptive", +) + +parser.add_argument( + "--silent", + action="store_true", + help="Flag which, if provided, ensures no output is generated to the terminal", +) args = parser.parse_args() @@ -119,29 +204,42 @@ save_splits = args.save_splits # Device configuration if forceCPU: - device = 'cpu' + device = "cpu" + device_string = "cpu" else: - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + if torch.cuda.is_available(): + device_string = "cuda" + device = torch.device(device_string) + elif torch.backends.mps.is_available() and torch.backends.mps.is_built(): + # Use MPS if available on ARM-based MacBooks + device_string = "mps" + device = torch.device(device_string) + else: + device_string = "cpu" + device = torch.device(device_string) + +if verbose: + print(f"Torch device={device_string}") ############################################################################### ################ Validate arguments and initialize: ################### # Ensure that provided data file exists -data_file = validate_args.check_file_exists(args.data_file, 'Datafile') +data_file = validate_args.check_file_exists(args.data_file, "Datafile") # Extract output directory and output prediction file name network_file = os.path.abspath(args.output_network) filename_prefix, output_dir = validate_args.split_file_and_directory(network_file) # If provided, check that split_file exists -if split_file != '': - split_file = validate_args.check_file_exists(split_file, 'Split-file') +if split_file != "": + split_file = 
validate_args.check_file_exists(split_file, "Split-file") else: split_file = None # If specified, get location where randomly generated train/val/test splits will be saved if save_splits: - save_splits_output = filename_prefix + '_split_file.txt' + save_splits_output = filename_prefix + "_split_file.txt" else: save_splits_output = None @@ -152,20 +250,22 @@ encoding_scheme, encoder, input_size = validate_args.set_encoding_scheme(encode) problem_type, collate_function = validate_args.set_ml_task(num_classes, dtype) # Ensure that network hyperparams (not being optimized) are valid -validate_args.check_positive(num_epochs, 'Number of epochs') -validate_args.check_positive(batch_size, 'Batch size') +validate_args.check_positive(num_epochs, "Number of epochs") +validate_args.check_positive(batch_size, "Batch size") # Ensure that the sum of setFractions adds up to 1 for frac in setFractions: - validate_args.check_between_zero_and_one(frac, 'Set fractions') + validate_args.check_between_zero_and_one(frac, "Set fractions") if sum(setFractions) != 1.0: - raise ValueError('Set fractions must sum to 1.') + raise ValueError("Set fractions must sum to 1.") # Ensure that task is binary sequence classification if # probabilistic_classfication is set if probabilistic_classification: - if dtype != 'sequence' or num_classes < 2: - raise ValueError('Probabilistic classification only implemented for sequence classification') + if dtype != "sequence" or num_classes < 2: + raise ValueError( + "Probabilistic classification only implemented for sequence classification" + ) # Set ignore_warnings to True if --silent is provided if silent: @@ -175,12 +275,20 @@ if silent: ################################ Main code ################################## # Split data -cvs, train, val, test = pid.split_data_cv(data_file, datatype=dtype, problem_type=problem_type, - num_classes=num_classes, excludeSeqID=excludeSeqID, - split_file=split_file, encoding_scheme=encoding_scheme, - encoder=encoder, 
ignoreWarnings=ignore_warnings, - percent_val=setFractions[1], percent_test=setFractions[2], - save_splits_output=save_splits_output) +cvs, train, val, test = pid.split_data_cv( + data_file, + datatype=dtype, + problem_type=problem_type, + num_classes=num_classes, + excludeSeqID=excludeSeqID, + split_file=split_file, + encoding_scheme=encoding_scheme, + encoder=encoder, + ignoreWarnings=ignore_warnings, + percent_val=setFractions[1], + percent_test=setFractions[2], + save_splits_output=save_splits_output, +) # Assess batch size compared to training set size if not ignore_warnings: @@ -189,10 +297,18 @@ if not ignore_warnings: # Convert CV datasets to dataloaders cv_loaders = [] for cv_train, cv_val in cvs: - cv_train_loader = torch.utils.data.DataLoader(dataset=cv_train, batch_size=batch_size, - collate_fn=collate_function, shuffle=True) - cv_val_loader = torch.utils.data.DataLoader(dataset=cv_val, batch_size=batch_size, - collate_fn=collate_function, shuffle=False) + cv_train_loader = torch.utils.data.DataLoader( + dataset=cv_train, + batch_size=batch_size, + collate_fn=collate_function, + shuffle=True, + ) + cv_val_loader = torch.utils.data.DataLoader( + dataset=cv_val, + batch_size=batch_size, + collate_fn=collate_function, + shuffle=False, + ) cv_loaders.append((cv_train_loader, cv_val_loader)) # Output to std out @@ -202,7 +318,7 @@ if silent is False: print("PARROT with hyperparameter optimization") print("---------------------------------------") if verbose: - print('Train on:\t%s' % device) + print("Train on:\t%s" % device) print("Datatype:\t%s" % dtype) print("ML Task:\t%s" % problem_type) print("Batch size:\t%d" % batch_size) @@ -210,70 +326,109 @@ if silent is False: print("Number of optimization iterations:\t%d\n" % max_iterations) # Optimization procedure -optimizer = bayesian_optimization.BayesianOptimizer(cv_loaders, input_size, num_epochs, - num_classes, dtype, network_file, - max_iterations, device, silent) +optimizer = 
bayesian_optimization.BayesianOptimizer( + cv_loaders, + input_size, + num_epochs, + num_classes, + dtype, + network_file, + max_iterations, + device, + silent, +) best_hyperparams = optimizer.optimize() -lr = 10**best_hyperparams[0] +lr = 10 ** best_hyperparams[0] nl = int(best_hyperparams[1]) hs = int(best_hyperparams[2]) # Save these hyperparamters to a file so that the user has a record # TODO: move to helper function -params_file = filename_prefix + '_optimal_hyperparams.txt' -with open(params_file, 'w') as f: - f.write('Learning rate: %.5f\n' % lr) - f.write('Num Layers: %d\n' % nl) - f.write('Hidden vector size: %d\n' % hs) +params_file = filename_prefix + "_optimal_hyperparams.txt" +with open(params_file, "w") as f: + f.write("Learning rate: %.5f\n" % lr) + f.write("Num Layers: %d\n" % nl) + f.write("Hidden vector size: %d\n" % hs) # Use these best hyperparams to train the network from scratch using the entire train/val sets # Add data to dataloaders -train_loader = torch.utils.data.DataLoader(dataset=train, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=True) -val_loader = torch.utils.data.DataLoader(dataset=val, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=False) -test_loader = torch.utils.data.DataLoader(dataset=test, - batch_size=1, # Set test batch size to 1 - collate_fn=collate_function, - shuffle=False) +train_loader = torch.utils.data.DataLoader( + dataset=train, batch_size=batch_size, collate_fn=collate_function, shuffle=True +) +val_loader = torch.utils.data.DataLoader( + dataset=val, batch_size=batch_size, collate_fn=collate_function, shuffle=False +) +test_loader = torch.utils.data.DataLoader( + dataset=test, + batch_size=1, # Set test batch size to 1 + collate_fn=collate_function, + shuffle=False, +) # Initialize network: -if dtype == 'sequence': - brnn_network = brnn_architecture.BRNN_MtO(input_size, hs, nl, num_classes, device).to(device) +if dtype == "sequence": + brnn_network = 
brnn_architecture.BRNN_MtO( + input_size, hs, nl, num_classes, device + ).to(device) else: # dtype == 'residues' - brnn_network = brnn_architecture.BRNN_MtM(input_size, hs, nl, num_classes, device).to(device) + brnn_network = brnn_architecture.BRNN_MtM( + input_size, hs, nl, num_classes, device + ).to(device) # Train network if silent is False: - print('Training with optimal hyperparams:') -train_loss, val_loss = train_network.train(brnn_network, train_loader, val_loader, datatype=dtype, - problem_type=problem_type, weights_file=network_file, - stop_condition='iter', device=device, learn_rate=lr, - n_epochs=num_epochs*2, verbose=verbose, silent=silent) + print("Training with optimal hyperparams:") +train_loss, val_loss = train_network.train( + brnn_network, + train_loader, + val_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + stop_condition="iter", + device=device, + learn_rate=lr, + n_epochs=num_epochs * 2, + verbose=verbose, + silent=silent, +) if include_figs: # Plot training & validation loss per epoch brnn_plot.training_loss(train_loss, val_loss, output_file_prefix=filename_prefix) # Test network -test_loss, test_set_predictions = train_network.test_labeled_data(brnn_network, test_loader, - datatype=dtype, problem_type=problem_type, - weights_file=network_file, num_classes=num_classes, - probabilistic_classification=probabilistic_classification, - include_figs=include_figs, device=device, - output_file_prefix=filename_prefix) +test_loss, test_set_predictions = train_network.test_labeled_data( + brnn_network, + test_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + num_classes=num_classes, + probabilistic_classification=probabilistic_classification, + include_figs=include_figs, + device=device, + output_file_prefix=filename_prefix, +) if silent is False: - print('\nTest Loss: %.4f' % test_loss) + print("\nTest Loss: %.4f" % test_loss) # Output performance metrics if not ignore_metrics: - 
brnn_plot.write_performance_metrics(test_set_predictions, dtype, problem_type, - probabilistic_classification, filename_prefix) - + brnn_plot.write_performance_metrics( + test_set_predictions, + dtype, + problem_type, + probabilistic_classification, + filename_prefix, + ) + # Output the test set predictions to a text file -brnn_plot.output_predictions_to_file(test_set_predictions, excludeSeqID, encoding_scheme, - probabilistic_classification, encoder, output_file_prefix=filename_prefix) +brnn_plot.output_predictions_to_file( + test_set_predictions, + excludeSeqID, + encoding_scheme, + probabilistic_classification, + encoder, + output_file_prefix=filename_prefix, +) diff --git a/scripts/parrot-train b/scripts/parrot-train index 39ae2b9..39dce11 100755 --- a/scripts/parrot-train +++ b/scripts/parrot-train @@ -1,7 +1,7 @@ #!/usr/bin/env python """ Usage: $ parrot-train data_file output_network - + Driver script for training a bidirectional recurrent neural network with user specified parameters. For more information on usage, use the '-h' flag. @@ -12,7 +12,7 @@ idptools-parrot was developed by the Holehouse lab Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ import os @@ -32,75 +32,182 @@ from parrot.tools import validate_args from parrot.tools import dataset_warnings # Parse the command line arguments -parser = argparse.ArgumentParser(description='Train and test a bi-directional RNN using entire sequence.') - -parser.add_argument('data_file', help='path to tsv file with format: ') - -parser.add_argument('output_network', help='location to save the trained network') - -parser.add_argument('-d', '--datatype', metavar='dtype', type=str, required=True, - help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'") - -parser.add_argument('-c', '--classes', type=int, metavar='num_classes', required=True, - help='REQUIRED. 
Number of output classes, for regression put 1') - -parser.add_argument('-hs', '--hidden-size', default=10, type=int, metavar='hidden_size', - help='hidden vector size (def=10)') - -parser.add_argument('-nl', '--num-layers', default=1, type=int, metavar='num_layers', - help='number of layers per direction (def=1)') - -parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, - metavar='learning_rate', help='(def=0.001)') - -parser.add_argument('-b', '--batch', default=32, type=int, metavar='batch_size', - help='size of training batch (def=32)') - -parser.add_argument('-e', '--epochs', default=100, type=int, metavar='num_epochs', - help='number of training epochs (def=100)') - -parser.add_argument('--split', default='', metavar='split_file', type=str, - help="file indicating how to split datafile into training, validation, and test sets") - -parser.add_argument('--stop', default='iter', metavar='stop_condition', - type=str, help="training stop condition: either 'auto' or 'iter' (default 'iter')") - -parser.add_argument('--set-fractions', nargs=3, default=[0.7, 0.15, 0.15], type=float, - dest='setFractions', metavar=('train', 'val', 'test'), - help='proportion of dataset that should be divided into training, validation, and test sets') - -parser.add_argument('--encode', default='onehot', type=str, metavar='encoding_scheme', - help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme") - -parser.add_argument('--exclude-seq-id', dest='excludeSeqID', action='store_true', - help='use if data_file lacks sequence IDs in the first column of each line') - -parser.add_argument('--probabilistic-classification', dest='probabilistic_classification', - action='store_true', help='Optional implementation for sequence classificaion') - -parser.add_argument('--include-figs', dest='include_figs', action='store_true', - help='Generate figures from training results and save to same location as network') - -parser.add_argument('--no-stats', 
dest='ignore_metrics', action='store_true', - help='If passed, do not output a performance stats file.') - -parser.add_argument('--force-cpu', dest='forceCPU', action='store_true', - help='force network to train on CPU, even if GPU is available') - -parser.add_argument('--gpu-id', dest='gpu_id', type=int, - help='User defined control over which CUDA device will be used by parrot') - -parser.add_argument('--ignore-warnings', '-w', dest='ignore_warnings', action='store_true', - help='Do not display warnings for dataset structure') - -parser.add_argument('--save-splits', dest='save_splits', action='store_true', - help='Save a split-file using the random splits from this run') - -parser.add_argument('--verbose', '-v', action='store_true', - help='Flag which, if provided, causes output to terminal to be more descriptive') - -parser.add_argument('--silent', action='store_true', - help="Flag which, if provided, ensures no output is generated to the terminal") +parser = argparse.ArgumentParser( + description="Train and test a bi-directional RNN using entire sequence." +) + +parser.add_argument( + "data_file", help="path to tsv file with format: " +) + +parser.add_argument("output_network", help="location to save the trained network") + +parser.add_argument( + "-d", + "--datatype", + metavar="dtype", + type=str, + required=True, + help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'", +) + +parser.add_argument( + "-c", + "--classes", + type=int, + metavar="num_classes", + required=True, + help="REQUIRED. 
Number of output classes, for regression put 1", +) + +parser.add_argument( + "-hs", + "--hidden-size", + default=10, + type=int, + metavar="hidden_size", + help="hidden vector size (def=10)", +) + +parser.add_argument( + "-nl", + "--num-layers", + default=1, + type=int, + metavar="num_layers", + help="number of layers per direction (def=1)", +) + +parser.add_argument( + "-lr", + "--learning-rate", + default=0.001, + type=float, + metavar="learning_rate", + help="(def=0.001)", +) + +parser.add_argument( + "-b", + "--batch", + default=32, + type=int, + metavar="batch_size", + help="size of training batch (def=32)", +) + +parser.add_argument( + "-e", + "--epochs", + default=100, + type=int, + metavar="num_epochs", + help="number of training epochs (def=100)", +) + +parser.add_argument( + "--split", + default="", + metavar="split_file", + type=str, + help="file indicating how to split datafile into training, validation, and test sets", +) + +parser.add_argument( + "--stop", + default="iter", + metavar="stop_condition", + type=str, + help="training stop condition: either 'auto' or 'iter' (default 'iter')", +) + +parser.add_argument( + "--set-fractions", + nargs=3, + default=[0.7, 0.15, 0.15], + type=float, + dest="setFractions", + metavar=("train", "val", "test"), + help="proportion of dataset that should be divided into training, validation, and test sets", +) + +parser.add_argument( + "--encode", + default="onehot", + type=str, + metavar="encoding_scheme", + help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme", +) + +parser.add_argument( + "--exclude-seq-id", + dest="excludeSeqID", + action="store_true", + help="use if data_file lacks sequence IDs in the first column of each line", +) + +parser.add_argument( + "--probabilistic-classification", + dest="probabilistic_classification", + action="store_true", + help="Optional implementation for sequence classificaion", +) + +parser.add_argument( + "--include-figs", + dest="include_figs", + 
action="store_true", + help="Generate figures from training results and save to same location as network", +) + +parser.add_argument( + "--no-stats", + dest="ignore_metrics", + action="store_true", + help="If passed, do not output a performance stats file.", +) + +parser.add_argument( + "--force-cpu", + dest="forceCPU", + action="store_true", + help="force network to train on CPU, even if GPU is available", +) + +parser.add_argument( + "--gpu-id", + dest="gpu_id", + type=int, + help="User defined control over which CUDA device will be used by parrot", +) + +parser.add_argument( + "--ignore-warnings", + "-w", + dest="ignore_warnings", + action="store_true", + help="Do not display warnings for dataset structure", +) + +parser.add_argument( + "--save-splits", + dest="save_splits", + action="store_true", + help="Save a split-file using the random splits from this run", +) + +parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Flag which, if provided, causes output to terminal to be more descriptive", +) + +parser.add_argument( + "--silent", + action="store_true", + help="Flag which, if provided, ensures no output is generated to the terminal", +) args = parser.parse_args() print(args) @@ -134,32 +241,48 @@ save_splits = args.save_splits # Device configuration if forceCPU: - device = 'cpu' + device = "cpu" + device_string = "cpu" elif gpu_id: - device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else 'cpu') - print(f"You've specified to run this network on cuda:{gpu_id}. Running on {device=}") + device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu") + device_string = "cuda" + print( + f"You've specified to run this network on cuda:{gpu_id}. 
Running on {device=}" + ) else: - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + if torch.cuda.is_available(): + device_string = "cuda" + device = torch.device(device_string) + elif torch.backends.mps.is_available() and torch.backends.mps.is_built(): + # Use MPS if available on ARM-based MacBooks + device_string = "mps" + device = torch.device(device_string) + else: + device_string = "cpu" + device = torch.device(device_string) + +if verbose: + print(f"Torch device={device_string}") ############################################################################### ############# Validate arguments and initialize network: ############## # Ensure that provided data file exists -data_file = validate_args.check_file_exists(args.data_file, 'Datafile') +data_file = validate_args.check_file_exists(args.data_file, "Datafile") # Extract output directory and output prediction file name network_file = os.path.abspath(args.output_network) filename_prefix, output_dir = validate_args.split_file_and_directory(network_file) # If provided, check that split_file exists -if split_file != '': - split_file = validate_args.check_file_exists(split_file, 'Split-file') +if split_file != "": + split_file = validate_args.check_file_exists(split_file, "Split-file") else: split_file = None # If specified, get location where randomly generated train/val/test splits will be saved if save_splits: - save_splits_output = filename_prefix + '_split_file.txt' + save_splits_output = filename_prefix + "_split_file.txt" else: save_splits_output = None @@ -170,69 +293,81 @@ encoding_scheme, encoder, input_size = validate_args.set_encoding_scheme(encode) problem_type, collate_function = validate_args.set_ml_task(num_classes, dtype) # Ensure that network hyperparams are valid -validate_args.check_between_zero_and_one(learning_rate, 'Learning rate') -validate_args.check_positive(hidden_size, 'Hidden vector size') -validate_args.check_positive(num_layers, 'Number of layers') 
-validate_args.check_positive(num_epochs, 'Number of epochs') -validate_args.check_positive(batch_size, 'Batch size') +validate_args.check_between_zero_and_one(learning_rate, "Learning rate") +validate_args.check_positive(hidden_size, "Hidden vector size") +validate_args.check_positive(num_layers, "Number of layers") +validate_args.check_positive(num_epochs, "Number of epochs") +validate_args.check_positive(batch_size, "Batch size") # Ensure that stop condition is 'iter' or 'auto' validate_args.check_stop_condition(stop_cond, num_epochs) # Ensure that the sum of setFractions adds up to 1 for frac in setFractions: - validate_args.check_between_zero_and_one(frac, 'Set fractions') + validate_args.check_between_zero_and_one(frac, "Set fractions") if sum(setFractions) != 1.0: - raise ValueError('Set fractions must sum to 1.') + raise ValueError("Set fractions must sum to 1.") # Ensure that task is binary sequence classification if # probabilistic_classfication is set if probabilistic_classification: - if dtype != 'sequence' or num_classes < 2: - raise ValueError('Probabilistic classification only implemented for sequence classification') + if dtype != "sequence" or num_classes < 2: + raise ValueError( + "Probabilistic classification only implemented for sequence classification" + ) # Set ignore_warnings to True if --silent is provided if silent: ignore_warnings = True # Initialize network architecture depending on data format -if dtype == 'sequence': +if dtype == "sequence": # Use a many-to-one architecture - brnn_network = brnn_architecture.BRNN_MtO(input_size, hidden_size, - num_layers, num_classes, device).to(device) -elif dtype == 'residues': + brnn_network = brnn_architecture.BRNN_MtO( + input_size, hidden_size, num_layers, num_classes, device + ).to(device) +elif dtype == "residues": # Use a many-to-many architecture - brnn_network = brnn_architecture.BRNN_MtM(input_size, hidden_size, - num_layers, num_classes, device).to(device) + brnn_network = 
brnn_architecture.BRNN_MtM( + input_size, hidden_size, num_layers, num_classes, device + ).to(device) ############################################################################### ################################ Main code ################################## # Split data -train, val, test = pid.split_data(data_file, datatype=dtype, problem_type=problem_type, - num_classes=num_classes, excludeSeqID=excludeSeqID, - split_file=split_file, encoding_scheme=encoding_scheme, - encoder=encoder, percent_val=setFractions[1], - percent_test=setFractions[2], ignoreWarnings=ignore_warnings, - save_splits_output=save_splits_output) +train, val, test = pid.split_data( + data_file, + datatype=dtype, + problem_type=problem_type, + num_classes=num_classes, + excludeSeqID=excludeSeqID, + split_file=split_file, + encoding_scheme=encoding_scheme, + encoder=encoder, + percent_val=setFractions[1], + percent_test=setFractions[2], + ignoreWarnings=ignore_warnings, + save_splits_output=save_splits_output, +) # Assess batch size compared to training set size if not ignore_warnings: dataset_warnings.eval_batch_size(batch_size, len(train)) # Add data to dataloaders -train_loader = torch.utils.data.DataLoader(dataset=train, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=True) -val_loader = torch.utils.data.DataLoader(dataset=val, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=False) -test_loader = torch.utils.data.DataLoader(dataset=test, - batch_size=1, # Set test batch size to 1 - collate_fn=collate_function, - shuffle=False) +train_loader = torch.utils.data.DataLoader( + dataset=train, batch_size=batch_size, collate_fn=collate_function, shuffle=True +) +val_loader = torch.utils.data.DataLoader( + dataset=val, batch_size=batch_size, collate_fn=collate_function, shuffle=False +) +test_loader = torch.utils.data.DataLoader( + dataset=test, + batch_size=1, # Set test batch size to 1 + collate_fn=collate_function, + shuffle=False, +) # Output to std out # 
TODO: move to helper function in /tools @@ -241,7 +376,7 @@ if silent is False: print("PARROT with user-specified parameters") print("-------------------------------------") if verbose > 1: - print('Train on:\t%s' % device) + print("Train on:\t%s" % device) print("Datatype:\t%s" % dtype) print("ML Task:\t%s" % problem_type) print("Learning rate:\t%f" % learning_rate) @@ -252,31 +387,57 @@ if silent is False: print("Validation set loss per epoch:") # Train network -train_loss, val_loss = train_network.train(brnn_network, train_loader, val_loader, datatype=dtype, - problem_type=problem_type, weights_file=network_file, - stop_condition=stop_cond, device=device, - learn_rate=learning_rate, n_epochs=num_epochs, - verbose=verbose, silent=silent) +train_loss, val_loss = train_network.train( + brnn_network, + train_loader, + val_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + stop_condition=stop_cond, + device=device, + learn_rate=learning_rate, + n_epochs=num_epochs, + verbose=verbose, + silent=silent, +) if include_figs: # Plot training & validation loss per epoch brnn_plot.training_loss(train_loss, val_loss, output_file_prefix=filename_prefix) # Test network -test_loss, test_set_predictions = train_network.test_labeled_data(brnn_network, test_loader, - datatype=dtype, problem_type=problem_type, - weights_file=network_file, num_classes=num_classes, - probabilistic_classification=probabilistic_classification, - include_figs=include_figs, device=device, - output_file_prefix=filename_prefix) +test_loss, test_set_predictions = train_network.test_labeled_data( + brnn_network, + test_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + num_classes=num_classes, + probabilistic_classification=probabilistic_classification, + include_figs=include_figs, + device=device, + output_file_prefix=filename_prefix, +) if silent is False: - print('\nTest Loss: %.4f' % test_loss) + print("\nTest Loss: %.4f" % test_loss) # 
Output performance metrics if not ignore_metrics: - brnn_plot.write_performance_metrics(test_set_predictions, dtype, problem_type, - probabilistic_classification, filename_prefix) + brnn_plot.write_performance_metrics( + test_set_predictions, + dtype, + problem_type, + probabilistic_classification, + filename_prefix, + ) # Output the test set predictions to a text file -brnn_plot.output_predictions_to_file(test_set_predictions, excludeSeqID, encoding_scheme, - probabilistic_classification, encoder, output_file_prefix=filename_prefix) +brnn_plot.output_predictions_to_file( + test_set_predictions, + excludeSeqID, + encoding_scheme, + probabilistic_classification, + encoder, + output_file_prefix=filename_prefix, +)