diff --git a/parrot/bayesian_optimization.py b/parrot/bayesian_optimization.py index 1243784..951926e 100644 --- a/parrot/bayesian_optimization.py +++ b/parrot/bayesian_optimization.py @@ -8,11 +8,9 @@ Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ -import math - import numpy as np try: @@ -20,18 +18,19 @@ import GPyOpt from GPyOpt.methods import BayesianOptimization except ImportError: - print('Error importing GPy.') - print(' If trying to run parrot-optimize, make sure to use `pip install idptools-parrot[optimize]`') + print("Error importing GPy.") + print( + " If trying to run parrot-optimize, make sure to use `pip install idptools-parrot[optimize]`" + ) -from parrot import train_network -from parrot import brnn_architecture +from parrot import brnn_architecture, train_network class BayesianOptimizer(object): """A class for conducting Bayesian Optimization on a PyTorch RNN Sets up and runs GPy Bayesian Optimization in order to choose the best- - performing hyperparameters for a RNN for a given machine learning task. + performing hyperparameters for a RNN for a given machine learning task. Iteratively change learning rate, hidden vector size, and the number of layers in the network, then train and validating using 5-fold cross validation. @@ -55,7 +54,7 @@ class BayesianOptimizer(object): weights_file : str Path to which the network weights will be saved during training device : str - 'cpu' or 'cuda' depending on system hardware + 'cpu', 'mps' or 'cuda' depending on system hardware max_iterations : int Maximum number of iterations to perform the optimization procedure silent : bool @@ -64,8 +63,18 @@ class BayesianOptimizer(object): GPy-compatible bounds for each of the hyperparameters to be optimized """ - def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, - dtype, weights_file, max_iterations, device, silent): + def __init__( + self, + cv_dataloaders, + input_size, + n_epochs, + n_classes, + dtype, + weights_file, + max_iterations, + device, + silent, + ): """ Parameters ---------- @@ -83,7 +92,7 @@ def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, weights_file : str Path to which the network weights will be saved during training device : str - 'cpu' or 'cuda' depending on system hardware + 'cpu', 'mps' or 'cuda' depending on system hardware max_iterations : int Maximum number of iterations to perform the optimization procedure silent : bool @@ -96,9 +105,9 @@ def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, self.n_folds = len(cv_dataloaders) self.n_classes = n_classes if n_classes > 1: - self.problem_type = 'classification' + self.problem_type = "classification" else: - self.problem_type = 'regression' + self.problem_type = "regression" self.dtype = dtype self.weights_file = weights_file @@ -106,9 +115,19 @@ def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, self.device = device self.silent = silent - self.bds = [{'name': 'log_learning_rate', 'type': 'continuous', 'domain': (-5, -2)}, # 0.00001-0.01 - {'name': 'n_layers', 'type': 'discrete', 'domain': tuple(range(1, 6))}, # 1-5 - {'name': 'hidden_size', 'type': 'discrete', 'domain': tuple(range(5, 51))}] # 5-50 + self.bds = [ + { + "name": "log_learning_rate", + "type": "continuous", + "domain": (-5, -2), + }, # 0.00001-0.01 + { + "name": "n_layers", + "type": "discrete", + "domain": tuple(range(1, 6)), + }, # 1-5 + {"name": "hidden_size", "type": 
"discrete", "domain": tuple(range(5, 51))}, + ] # 5-50 def compute_cv_loss(self, hyperparameters): """Compute the average cross-val loss for a given set of hyperparameters @@ -125,7 +144,7 @@ def compute_cv_loss(self, hyperparameters): Returns ------- numpy float array - a Nx1 numpy array of the average cross-val loss + a Nx1 numpy array of the average cross-val loss per set of input hyperparameters """ @@ -134,7 +153,7 @@ def compute_cv_loss(self, hyperparameters): for i in range(len(hyperparameters)): log_lr, nl, hs = hyperparameters[i] - lr = 10**float(log_lr) + lr = 10 ** float(log_lr) nl = int(nl) hs = int(hs) @@ -143,7 +162,10 @@ def compute_cv_loss(self, hyperparameters): avg = np.average(cv_outputs[i]) if self.silent is False: - print(' %.6f | %2d | %2d | %.3f' % (lr, nl, hs, avg)) + print( + " %.6f | %2d | %2d | %.3f" + % (lr, nl, hs, avg) + ) outputs = np.average(cv_outputs, axis=1) return outputs @@ -166,23 +188,36 @@ def eval_cv_brnns(self, lr, nl, hs): the best validation loss from each fold of cross validation """ - cv_losses = np.zeros(self.n_folds) - 1 # -1 so that it's obvious if something goes wrong + cv_losses = ( + np.zeros(self.n_folds) - 1 + ) # -1 so that it's obvious if something goes wrong for k in range(self.n_folds): - if self.dtype == 'sequence': + if self.dtype == "sequence": # Use a many-to-one architecture - brnn_network = brnn_architecture.BRNN_MtO(self.input_size, hs, nl, - self.n_classes, self.device).to(self.device) + brnn_network = brnn_architecture.BRNN_MtO( + self.input_size, hs, nl, self.n_classes, self.device + ).to(self.device) else: # Use a many-to-many architecture - brnn_network = brnn_architecture.BRNN_MtM(self.input_size, hs, nl, - self.n_classes, self.device).to(self.device) + brnn_network = brnn_architecture.BRNN_MtM( + self.input_size, hs, nl, self.n_classes, self.device + ).to(self.device) # Train network with this set of hyperparameters - train_losses, val_losses = train_network.train(brnn_network, self.cv_loaders[k][0], - self.cv_loaders[k][1], self.dtype, self.problem_type, - self.weights_file, stop_condition='iter', device=self.device, - learn_rate=lr, n_epochs=self.n_epochs, silent=True) + train_losses, val_losses = train_network.train( + brnn_network, + self.cv_loaders[k][0], + self.cv_loaders[k][1], + self.dtype, + self.problem_type, + self.weights_file, + stop_condition="iter", + device=self.device, + learn_rate=lr, + n_epochs=self.n_epochs, + silent=True, + ) # Take best val loss best_val_loss = np.min(val_losses) cv_losses[k] = best_val_loss @@ -211,7 +246,7 @@ def initial_search(self, x): for i in range(len(x)): log_lr, nl, hs = x[i] - lr = 10**float(log_lr) + lr = 10 ** float(log_lr) nl = int(nl) hs = int(hs) @@ -237,32 +272,55 @@ def optimize(self): """ # Initial hyperparameter search -- used to get noise estimate - x_init = np.array([[-3.0, 1, 20], [-3.0, 2, 20], [-3.0, 3, 20], [-3.0, 4, 20], [-3.0, 5, 20], - [-2.0, 2, 20], [-3.3, 2, 20], [-4.0, 2, 20], [-5.0, 2, 20], - [-3.0, 2, 5], [-3.0, 2, 15], [-3.0, 2, 35], [-3.0, 2, 50]]) + x_init = np.array( + [ + [-3.0, 1, 20], + [-3.0, 2, 20], + [-3.0, 3, 20], + [-3.0, 4, 20], + [-3.0, 5, 20], + [-2.0, 2, 20], + [-3.3, 2, 20], + [-4.0, 2, 20], + [-5.0, 2, 20], + [-3.0, 2, 5], + [-3.0, 2, 15], + [-3.0, 2, 35], + [-3.0, 2, 50], + ] + ) y_init, noise = self.initial_search(x_init) if self.silent is False: print("\nInitial search results:") print("lr\tnl\ths\toutput") for i in range(len(x_init)): - print("%.5f\t%2d\t%2d\t%.4f" % (10**x_init[i][0], x_init[i][1], x_init[i][2], 
y_init[i][0])) + print( + "%.5f\t%2d\t%2d\t%.4f" + % (10 ** x_init[i][0], x_init[i][1], x_init[i][2], y_init[i][0]) + ) print("Noise estimate:", noise) - print('\n') - print('Primary optimization:') - print('--------------------\n') - print('Learning rate | n_layers | hidden vector size | avg CV loss ') - print('======================================================================') - - optimizer = BayesianOptimization(f=self.compute_cv_loss, - domain=self.bds, - model_type='GP', - acquisition_type='EI', - acquisition_jitter=0.05, - X=x_init, - Y=y_init, - noise_var=noise, - maximize=False) + print("\n") + print("Primary optimization:") + print("--------------------\n") + print( + "Learning rate | n_layers | hidden vector size | avg CV loss " + ) + print( + "======================================================================" + ) + + optimizer = BayesianOptimization( + f=self.compute_cv_loss, + domain=self.bds, + model_type="GP", + acquisition_type="EI", + acquisition_jitter=0.05, + X=x_init, + Y=y_init, + noise_var=noise, + maximize=False, + ) optimizer.run_optimization(max_iter=self.max_iterations) @@ -270,8 +328,10 @@ def optimize(self): outs = optimizer.get_evaluations()[1].flatten() if self.silent is False: - print("\nThe optimal hyperparameters are:\nlr = %.5f\nnl = %d\nhs = %d" % - (10**optimizer.x_opt[0], optimizer.x_opt[1], optimizer.x_opt[2])) + print( + "\nThe optimal hyperparameters are:\nlr = %.5f\nnl = %d\nhs = %d" + % (10 ** optimizer.x_opt[0], optimizer.x_opt[1], optimizer.x_opt[2]) + ) print() return optimizer.x_opt diff --git a/parrot/brnn_architecture.py b/parrot/brnn_architecture.py index efe09c9..7e52527 100644 --- a/parrot/brnn_architecture.py +++ b/parrot/brnn_architecture.py @@ -8,7 +8,7 @@ Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ import torch @@ -24,15 +24,15 @@ class BRNN_MtM(nn.Module): aggregates the deepest hidden layers of both directions and produces the outputs. - "Many-to-many" refers to the fact that the network will produce outputs - corresponding to every item of the input sequence. For example, an input + "Many-to-many" refers to the fact that the network will produce outputs + corresponding to every item of the input sequence. For example, an input sequence of length 10 will produce 10 sequential outputs. Attributes ---------- device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). hidden_size : int Size of hidden vectors in the network num_layers : int @@ -43,8 +43,8 @@ class BRNN_MtM(nn.Module): it should be the number of classes. lstm : PyTorch LSTM object The bidirectional LSTM layer(s) of the recurrent neural network. - fc : PyTorch Linear object - The fully connected linear layer of the recurrent neural network. Across + fc : PyTorch Linear object + The fully connected linear layer of the recurrent neural network. Across the length of the input sequence, this layer aggregates the output of the LSTM nodes from the deepest forward layer and deepest reverse layer and returns the output for that residue in the sequence. @@ -66,7 +66,7 @@ def __init__(self, input_size, hidden_size, num_layers, num_classes, device): it should be the number of classes. device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). 
""" super(BRNN_MtM, self).__init__() @@ -74,10 +74,12 @@ def __init__(self, input_size, hidden_size, num_layers, num_classes, device): self.hidden_size = hidden_size self.num_layers = num_layers self.num_classes = num_classes - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first=True, bidirectional=True) - self.fc = nn.Linear(in_features=hidden_size*2, # *2 for bidirection - out_features=num_classes) + self.lstm = nn.LSTM( + input_size, hidden_size, num_layers, batch_first=True, bidirectional=True + ) + self.fc = nn.Linear( + in_features=hidden_size * 2, out_features=num_classes # *2 for bidirection + ) def forward(self, x): """Propogate input sequences through the network to produce outputs @@ -98,10 +100,12 @@ def forward(self, x): # Set initial states # h0 and c0 dimensions: [num_layers*2 X batch_size X hidden_size] - h0 = torch.zeros(self.num_layers*2, # *2 for bidirection - x.size(0), self.hidden_size).to(self.device) - c0 = torch.zeros(self.num_layers*2, - x.size(0), self.hidden_size).to(self.device) + h0 = torch.zeros( + self.num_layers * 2, x.size(0), self.hidden_size # *2 for bidirection + ).to(self.device) + c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to( + self.device + ) # Forward propagate LSTM # out: tensor of shape: [batch_size, seq_length, hidden_size*2] @@ -121,7 +125,7 @@ class BRNN_MtO(nn.Module): aggregates the deepest hidden layers of both directions and produces the output. - "Many-to-one" refers to the fact that the network will produce a single output + "Many-to-one" refers to the fact that the network will produce a single output for an entire input sequence. For example, an input sequence of length 10 will produce only one output. @@ -129,7 +133,7 @@ class BRNN_MtO(nn.Module): ---------- device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). hidden_size : int Size of hidden vectors in the network num_layers : int @@ -140,8 +144,8 @@ class BRNN_MtO(nn.Module): it should be the number of classes. lstm : PyTorch LSTM object The bidirectional LSTM layer(s) of the recurrent neural network. - fc : PyTorch Linear object - The fully connected linear layer of the recurrent neural network. Across + fc : PyTorch Linear object + The fully connected linear layer of the recurrent neural network. Across the length of the input sequence, this layer aggregates the output of the LSTM nodes from the deepest forward layer and deepest reverse layer and returns the output for that residue in the sequence. @@ -163,17 +167,19 @@ def __init__(self, input_size, hidden_size, num_layers, num_classes, device): it should be the number of classes. device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). 
""" super(BRNN_MtO, self).__init__() self.device = device self.hidden_size = hidden_size self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first=True, bidirectional=True) - self.fc = nn.Linear(in_features=hidden_size*2, # *2 for bidirection - out_features=num_classes) + self.lstm = nn.LSTM( + input_size, hidden_size, num_layers, batch_first=True, bidirectional=True + ) + self.fc = nn.Linear( + in_features=hidden_size * 2, out_features=num_classes # *2 for bidirection + ) def forward(self, x): """Propogate input sequences through the network to produce outputs @@ -194,10 +200,12 @@ def forward(self, x): # Set initial states # h0 and c0 dimensions: [num_layers*2 X batch_size X hidden_size] - h0 = torch.zeros(self.num_layers*2, # *2 for bidirection - x.size(0), self.hidden_size).to(self.device) - c0 = torch.zeros(self.num_layers*2, - x.size(0), self.hidden_size).to(self.device) + h0 = torch.zeros( + self.num_layers * 2, x.size(0), self.hidden_size # *2 for bidirection + ).to(self.device) + c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to( + self.device + ) # Forward propagate LSTM # out: tensor of shape: [batch_size, seq_length, hidden_size*2] diff --git a/parrot/brnn_plot.py b/parrot/brnn_plot.py index 3a107d8..efbc327 100644 --- a/parrot/brnn_plot.py +++ b/parrot/brnn_plot.py @@ -1,5 +1,5 @@ """ -Plot training results for regression and classification tasks on both +Plot training results for regression and classification tasks on both sequence-mapped and residue-mapped data. ............................................................................. @@ -9,24 +9,41 @@ Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ -import numpy as np -import torch import itertools -from scipy.stats import linregress, pearsonr, spearmanr -from sklearn.metrics import roc_curve, auc -from sklearn.metrics import precision_recall_curve, average_precision_score -from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score -import matplotlib.pyplot as plt -import seaborn as sn +import os +import traceback + +import matplotlib as mpl + +mpl.use('Agg') + +import numpy as np import pandas as pd +import seaborn as sn +from scipy.stats import linregress, pearsonr, spearmanr +from sklearn.metrics import ( + accuracy_score, + auc, + average_precision_score, + f1_score, + matthews_corrcoef, + precision_recall_curve, + roc_curve, +) from parrot import encode_sequence +# Set global font size and line width for better publication clarity +mpl.rcParams["font.size"] = 12 +mpl.rcParams["lines.linewidth"] = 2 -def training_loss(train_loss, val_loss, output_file_prefix=''): +import matplotlib.pyplot as plt + + +def training_loss(train_loss, val_loss, output_file_prefix=""): """Plot training and validation loss per epoch Figure is saved to file at "_train_val_loss.png". 
@@ -41,34 +58,63 @@ def training_loss(train_loss, val_loss, output_file_prefix=''): File to which the plot will be saved as "_train_val_loss.png" """ - fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6)) - props = dict(boxstyle='round', facecolor='gainsboro', alpha=0.5) + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8)) num_epochs = len(train_loss) # Loss per epoch - training_loss, = ax.plot(np.arange(1, num_epochs+1), train_loss, label='Train') - validation_loss, = ax.plot(np.arange(1, num_epochs+1), val_loss, label='Val') - ax.set_xlabel("Epoch") - ax.set_ylabel("Avg loss") - ax.set_title("Training and testing loss per epoch") - ax.legend(handles=[training_loss, validation_loss], fontsize=14, - facecolor='gainsboro', edgecolor='slategray') + (training_loss,) = ax.plot(np.arange(1, num_epochs + 1), train_loss, label="Train") + (validation_loss,) = ax.plot(np.arange(1, num_epochs + 1), val_loss, label="Val") + ax.set_xlabel("Epoch", fontsize=10) + ax.set_ylabel("Avg loss", fontsize=10) + plt.suptitle("Training and testing loss per epoch", size=12) + plt.title(f"epochs={num_epochs}", size=11) + + # Shrink current axis's height by 10% on the bottom + box = ax.get_position() + ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9]) + + # Add legend at the bottom + ax.legend( + handles=[training_loss, validation_loss], + loc="upper center", + fontsize=8, + facecolor="gainsboro", + edgecolor="slategray", + bbox_to_anchor=(0.5, -0.15), + ncol=2, + fancybox=True, + ) if num_epochs < 21: - ax.set_xticks(np.arange(2, num_epochs+1, 2)) + ax.set_xticks(np.arange(2, num_epochs + 1, 2)) elif num_epochs < 66: - ax.set_xticks(np.arange(5, num_epochs+1, 5)) + ax.set_xticks(np.arange(5, num_epochs + 1, 5)) elif num_epochs < 151: - ax.set_xticks(np.arange(10, num_epochs+1, 10)) + ax.set_xticks(np.arange(10, num_epochs + 1, 10)) else: - ax.set_xticks(np.arange(50, num_epochs+1, 50)) - - plt.savefig(output_file_prefix + '_train_val_loss.png') + ax.set_xticks(np.arange(50, num_epochs + 1, 50)) + + plt.savefig( + output_file_prefix + "_train_val_loss.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_train_val_loss.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) plt.clf() -def sequence_regression_scatterplot(true, predicted, output_file_prefix=''): +def sequence_regression_scatterplot(true, predicted, output_file_prefix=""): """Create a scatterplot for a sequence-mapped values regression problem Figure is saved to file at "_seq_scatterplot.png". 
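
Every plotting function in this patch now saves the same figure twice, once as PNG and once as PDF, with identical dpi/transparency/bbox settings. If that repetition becomes a maintenance burden, it could be factored into a small helper along these lines (hypothetical `save_figure` helper, not part of the patch):

import matplotlib.pyplot as plt

def save_figure(output_file_prefix, name):
    """Save the current figure as both PNG and PDF, then clear it."""
    for ext in ("png", "pdf"):
        plt.savefig(
            f"{output_file_prefix}_{name}.{ext}",
            dpi=300,
            transparent=True,
            format=ext,
            bbox_inches="tight",
            pad_inches=0.1,
        )
    plt.clf()

# e.g. save_figure(output_file_prefix, "train_val_loss")
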
@@ -93,20 +139,48 @@ def sequence_regression_scatterplot(true, predicted, output_file_prefix=''): for item in predicted: pred_list.append(item.cpu().numpy()[0][0]) - plt.scatter(true_list, pred_list) - edge_vals = [0.9*min(min(true_list), min(pred_list)), - 1.1*max(max(true_list), max(pred_list))] + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8)) + ax.scatter(true_list, pred_list) + edge_vals = [ + 0.9 * min(min(true_list), min(pred_list)), + 1.1 * max(max(true_list), max(pred_list)), + ] plt.xlim(edge_vals) plt.ylim(edge_vals) - plt.plot(edge_vals, edge_vals, 'k--') - plt.xlabel('True') - plt.ylabel('Predicted') - slope, intercept, r_value, p_value, std_err = linregress(true_list, pred_list) - plt.title('Testing accuracy: R^2=%.3f' % (r_value**2)) - plt.savefig(output_file_prefix + '_seq_scatterplot.png') + plt.plot(edge_vals, edge_vals, "k--") + plt.xlabel("True", fontsize=10) + plt.ylabel("Predicted", fontsize=10) + + r_value = 0 + try: + slope, intercept, r_value, p_value, std_err = linregress(true_list, pred_list) + except: + traceback.print_exc() + + plt.suptitle( + "Testing accuracy for a sequence-mapped values regression problem", size=12 + ) + plt.title("R^2=%.3f" % (r_value**2), size=11) + plt.savefig( + output_file_prefix + "_seq_scatterplot.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_seq_scatterplot.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.clf() -def residue_regression_scatterplot(true, predicted, output_file_prefix=''): +def residue_regression_scatterplot(true, predicted, output_file_prefix=""): """Create a scatterplot for a residue-mapped values regression problem Each sequence is plotted with a unique marker-color combination, up to 70 @@ -120,7 +194,7 @@ def residue_regression_scatterplot(true, predicted, output_file_prefix=''): A list where each item is a [1 x len(sequence)] tensor with the true regression values of each residue in a sequence predicted : list of PyTorch FloatTensors - A list where each item is a [1 x len(sequence)] tensor with the + A list where each item is a [1 x len(sequence)] tensor with the regression predictions for each residue in a sequence output_file_prefix : str, optional File to which the plot will be saved as "_res_scatterplot.png" @@ -129,7 +203,7 @@ def residue_regression_scatterplot(true, predicted, output_file_prefix=''): true_list = [] pred_list = [] - marker = itertools.cycle(('>', '+', '.', 'o', '*', 'v', 'D')) + marker = itertools.cycle((">", "+", ".", "o", "*", "v", "D")) for item in true: single_frag = item.cpu().numpy()[0].flatten() @@ -138,24 +212,54 @@ def residue_regression_scatterplot(true, predicted, output_file_prefix=''): single_frag = item.cpu().numpy()[0].flatten() pred_list.append(list(single_frag)) - for i in range(len(true_list)): - plt.scatter(true_list[i], pred_list[i], s=6, marker=next(marker)) + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8)) - plt.figure(1) + for i in range(len(true_list)): + ax.scatter(true_list[i], pred_list[i], s=6, marker=next(marker)) left, right = plt.xlim() bottom, top = plt.ylim() edge_vals = [min(left, bottom), max(right, top)] plt.xlim(edge_vals) plt.ylim(edge_vals) - plt.plot(edge_vals, edge_vals, 'k--') - plt.xlabel('True') - plt.ylabel('Predicted') - slope, intercept, r_value, p_value, std_err = linregress(sum(true_list, []), sum(pred_list, [])) - plt.title('Testing accuracy: R^2=%.3f' % (r_value**2)) - 
plt.savefig(output_file_prefix + '_res_scatterplot.png') - -def plot_roc_curve(true_classes, predicted_class_probs, num_classes, output_file_prefix=''): + plt.plot(edge_vals, edge_vals, "k--") + plt.xlabel("True", fontsize=10) + plt.ylabel("Predicted", fontsize=10) + r_value = 0 + + try: + slope, intercept, r_value, p_value, std_err = linregress( + sum(true_list, []), sum(pred_list, []) + ) + except: + traceback.print_exc() + + plt.suptitle( + "Testing accuracy for a sequence-mapped values regression problem", size=12 + ) + plt.title("R^2=%.3f" % (r_value**2), size=11) + plt.savefig( + output_file_prefix + "_res_scatterplot.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_res_scatterplot.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.clf() + + +def plot_roc_curve( + true_classes, predicted_class_probs, num_classes, output_file_prefix="" +): """Create an ROC curve for a sequence classification problem Figure is saved to file at "_ROC_curve.png". @@ -188,40 +292,85 @@ def plot_roc_curve(true_classes, predicted_class_probs, num_classes, output_file fpr[c], tpr[c], _ = roc_curve(y_test[:, c], y_score[:, c]) roc_auc[c] = auc(fpr[c], tpr[c]) - plt.figure() + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8)) if num_classes > 2: # Compute micro-average ROC curve and ROC area (if multiclass) fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) # Plot all ROC curves - plt.plot(fpr["micro"], tpr["micro"], - label='Average (area = {0:0.2f})' - ''.format(roc_auc["micro"]), - color='deeppink', linestyle=':', linewidth=4) + ax.plot( + fpr["micro"], + tpr["micro"], + label="Average (area = {0:0.2f})" "".format(roc_auc["micro"]), + color="deeppink", + linestyle=":", + linewidth=4, + ) for c in range(num_classes): - plt.plot(fpr[c], tpr[c], lw=2, - label='Class {0} (area = {1:0.2f})' - ''.format(c, roc_auc[c])) - - elif num_classes == 2: # If binary classification + ax.plot( + fpr[c], + tpr[c], + lw=2, + label="Class {0} (area = {1:0.2f})" "".format(c, roc_auc[c]), + ) + + elif num_classes == 2: # If binary classification # Plot only one curve (doesn't matter which one, they are symmetric) - plt.plot(fpr[1], tpr[1], lw=2, - label='Binary class (area = {0:0.2f})' - ''.format(roc_auc[1])) - - plt.plot([0, 1], [0, 1], 'k--', lw=2) + ax.plot( + fpr[1], + tpr[1], + lw=2, + label="Binary class (area = {0:0.2f})" "".format(roc_auc[1]), + ) + + ax.plot([0, 1], [0, 1], "k--", lw=2) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) - plt.xlabel('False Positive Rate') - plt.ylabel('True Positive Rate') - plt.title('Receiver operating characteristic') - plt.legend(loc="lower right", fontsize=8) - plt.savefig(output_file_prefix + '_ROC_curve.png') - -def plot_precision_recall_curve(true_classes, predicted_class_probs, - num_classes, output_file_prefix=''): + plt.xlabel("False Positive Rate", fontsize=10) + plt.ylabel("True Positive Rate", fontsize=10) + + plt.suptitle("Receiver operating characteristic (ROC) curve", size=12) + plt.title("for a sequence-mapped values classification problem", size=11) + + # Shrink current axis's height by 10% on the bottom + box = ax.get_position() + ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9]) + + # Add legend at the bottom + ax.legend( + loc="upper center", + fontsize=8, + facecolor="gainsboro", + edgecolor="slategray", + 
bbox_to_anchor=(0.5, -0.15), + ncol=5, + fancybox=True, + ) + + plt.savefig( + output_file_prefix + "_ROC_curve.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_ROC_curve.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.clf() + + +def plot_precision_recall_curve( + true_classes, predicted_class_probs, num_classes, output_file_prefix="" +): """Create an PR curve for a sequence classification problem Figure is saved to file at "_PR_curve.png". @@ -251,36 +400,79 @@ def plot_precision_recall_curve(true_classes, predicted_class_probs, recall = dict() average_precision = dict() for i in range(num_classes): - precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], - y_score[:, i]) + precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_score[:, i]) average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i]) # A "micro-average": quantifying score on all classes jointly - precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(), - y_score.ravel()) - average_precision["micro"] = average_precision_score(y_test, y_score, - average="micro") + precision["micro"], recall["micro"], _ = precision_recall_curve( + y_test.ravel(), y_score.ravel() + ) + average_precision["micro"] = average_precision_score( + y_test, y_score, average="micro" + ) # Plot - plt.figure() - plt.plot(recall["micro"], precision["micro"], color='deeppink', linestyle=':', - linewidth=4, label='Average (area = {0:0.2f})' - ''.format(average_precision["micro"])) + fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8)) + ax.plot( + recall["micro"], + precision["micro"], + color="deeppink", + linestyle=":", + linewidth=4, + label="Average (area = {0:0.2f})" "".format(average_precision["micro"]), + ) for c in range(num_classes): - plt.plot(recall[c], precision[c], lw=2, - label='Class {0} (area = {1:0.2f})' - ''.format(c, average_precision[c])) + ax.plot( + recall[c], + precision[c], + lw=2, + label="Class {0} (area = {1:0.2f})" "".format(c, average_precision[c]), + ) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) - plt.xlabel('Recall') - plt.ylabel('Precision') - plt.title('Precision-Recall') - plt.legend() - plt.savefig(output_file_prefix + '_PR_curve.png') + plt.xlabel("Recall", fontsize=10) + plt.ylabel("Precision", fontsize=10) + plt.suptitle("Precision-Recall curve", size=12) + plt.title("for a sequence-mapped values classification problem", size=11) + + # Shrink current axis's height by 10% on the bottom + box = ax.get_position() + ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9]) + + # Add legend at the bottom + ax.legend( + loc="upper center", + fontsize=8, + facecolor="gainsboro", + edgecolor="slategray", + bbox_to_anchor=(0.5, -0.15), + ncol=5, + fancybox=True, + ) + + plt.savefig( + output_file_prefix + "_PR_curve.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_PR_curve.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.clf() -def confusion_matrix(true_classes, predicted_classes, num_classes, output_file_prefix=''): +def confusion_matrix( + true_classes, predicted_classes, num_classes, output_file_prefix="" +): """Create a confusion matrix for a sequence classification problem Figure is saved to file at "_seq_CM.png". 
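
The ROC and precision-recall code above relies on the true classes being one-hot encoded so that each class column can be scored against the predicted probabilities. A toy illustration with synthetic values (not parrot data) of the per-class calls used in `plot_roc_curve` and `plot_precision_recall_curve`:

import numpy as np
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

y_test = np.array([[1, 0], [0, 1], [1, 0], [0, 1]])                   # one-hot true classes
y_score = np.array([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4], [0.2, 0.8]])  # predicted probabilities

fpr, tpr, _ = roc_curve(y_test[:, 1], y_score[:, 1])
print("class-1 ROC AUC:", auc(fpr, tpr))

precision, recall, _ = precision_recall_curve(y_test[:, 1], y_score[:, 1])
print("class-1 average precision:", average_precision_score(y_test[:, 1], y_score[:, 1]))
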
@@ -304,16 +496,40 @@ class label for a particular sequence cm[np.argmax(predicted_classes[i][0].cpu().numpy()), true_classes[i][0]] += 1 df_cm = pd.DataFrame(cm, range(num_classes), range(num_classes)) - sn.set(font_scale=1.4) # for label size - sn.heatmap(df_cm, cmap='Blues', annot=True, annot_kws={"size": 16}) # font size - plt.xlabel('True labels') - plt.ylabel('Predicted labels') - plt.title('Test set confusion matrix') + sn.set_theme(font_scale=1.4) # for label size + sn.heatmap( + df_cm, cmap="Blues", annot=True, annot_kws={"size": 8}, fmt="g" + ) # font size + + plt.xlabel("True labels", fontsize=10) + plt.ylabel("Predicted labels", fontsize=10) + plt.suptitle("Test set confusion matrix", size=12) + plt.title("for a sequence-mapped values classification problem", size=11) + plt.tight_layout() - plt.savefig(output_file_prefix + '_seq_CM.png') + + plt.savefig( + output_file_prefix + "_seq_CM.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_seq_CM.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.clf() -def res_confusion_matrix(true_classes, predicted_classes, num_classes, output_file_prefix=''): +def res_confusion_matrix( + true_classes, predicted_classes, num_classes, output_file_prefix="" +): """Create a confusion matrix for a residue classification problem Figure is saved to file at "_res_CM.png". @@ -351,17 +567,39 @@ def res_confusion_matrix(true_classes, predicted_classes, num_classes, output_fi cm[pred_list[i], true_list[i]] += 1 df_cm = pd.DataFrame(cm, range(num_classes), range(num_classes)) - sn.set(font_scale=1.4) # for label size - sn.heatmap(df_cm, cmap='Blues', annot=True, annot_kws={"size": 16}) # font size - plt.xlabel('True labels') - plt.ylabel('Predicted labels') - plt.title('Test set confusion matrix') + sn.set_theme(font_scale=1.4) # for label size + sn.heatmap( + df_cm, cmap="Blues", annot=True, annot_kws={"size": 8}, fmt="g" + ) # font size + plt.xlabel("True labels", fontsize=10) + plt.ylabel("Predicted labels", fontsize=10) + plt.suptitle("Test set confusion matrix ", size=12) + plt.title("for a residue-mapped values classification problem", size=11) + plt.tight_layout() - plt.savefig(output_file_prefix + '_res_CM.png') + + plt.savefig( + output_file_prefix + "_res_CM.png", + dpi=300, + transparent=True, + format="png", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.savefig( + output_file_prefix + "_res_CM.pdf", + dpi=300, + transparent=True, + format="pdf", + bbox_inches="tight", + pad_inches=0.1, + ) + plt.clf() -def write_performance_metrics(sequence_data, dtype, problem_type, - prob_class, output_file_prefix=''): +def write_performance_metrics( + sequence_data, dtype, problem_type, prob_class, output_file_prefix="" +): """Writes a short text file describing performance on a variety of metrics Writes different output depending on whether a classification or regression task @@ -390,13 +628,25 @@ def write_performance_metrics(sequence_data, dtype, problem_type, true_vals = [l[1] for l in sequence_data] pred_vals = [l[2] for l in sequence_data] - perform_metrics = {} - - if dtype == 'residues': + perform_metrics = { + "File": os.path.basename(output_file_prefix), + "Dataset_Type": dtype, + "Problem_Type": problem_type, + "Prob_Class": prob_class, + "Pearson_R": None, + "Spearman_R": None, + "Area_under_Precision_Recall_curve": None, + "Area_under_ROC": None, + "Matthews_Correlation_Coef": None, + "F1_Score": None, 
+ "Accuracy": None, + } + + if dtype == "residues": true_vals = np.hstack(true_vals) pred_vals = np.hstack(pred_vals) - if problem_type == 'classification': + if problem_type == "classification": # Take care of probabilistic-classification case first if prob_class: # Reformat @@ -406,39 +656,61 @@ def write_performance_metrics(sequence_data, dtype, problem_type, true_vals_array[i, true_vals[i]] = 1 # AUROC, AUPRC - perform_metrics['Area under Precision-Recall curve'] = round( - average_precision_score(true_vals_array, - pred_vals, average="micro"), 3) + perform_metrics["Area_under_Precision_Recall_curve"] = round( + average_precision_score(true_vals_array, pred_vals, average="micro"), 3 + ) fpr, tpr, _ = roc_curve(true_vals_array.ravel(), pred_vals.ravel()) - perform_metrics["Area under ROC"] = round(auc(fpr, tpr), 3) + perform_metrics["Area_under_ROC"] = round(auc(fpr, tpr), 3) # Change probs to discrete classes pred_vals = np.argmax(pred_vals, axis=1) # Then take care of general classification stats: accuracy, F1, MCC - perform_metrics['Matthews Correlation Coef'] = round( - matthews_corrcoef(true_vals, pred_vals), 3) - perform_metrics['F1 Score'] = round( - f1_score(true_vals, pred_vals, average='weighted'), 3) - perform_metrics['Accuracy'] = round(accuracy_score(true_vals, pred_vals), 3) - - - elif problem_type == 'regression': + perform_metrics["Matthews_Correlation_Coef"] = round( + matthews_corrcoef(true_vals, pred_vals), 3 + ) + perform_metrics["F1_Score"] = round( + f1_score(true_vals, pred_vals, average="weighted"), 3 + ) + perform_metrics["Accuracy"] = round(accuracy_score(true_vals, pred_vals), 3) + + elif problem_type == "regression": # Pearson R, Spearman R pears_r, p_val = pearsonr(true_vals, pred_vals) - perform_metrics['Pearson R'] = round(pears_r, 3) + perform_metrics["Pearson_R"] = round(pears_r, 3) spearman_r, p_val = spearmanr(true_vals, pred_vals) - perform_metrics['Spearman R'] = round(spearman_r, 3) + perform_metrics["Spearman_R"] = round(spearman_r, 3) # Write performance metrics to file - with open(output_file_prefix + '_performance_stats.txt', 'w') as f: + with open(output_file_prefix + "_performance_stats.txt", "w") as f: for key, value in perform_metrics.items(): - outstr = '%s : %.3f\n' % (key, value) + if isinstance(value, (int, float)): + outstr = "%s: %.3f\n" % (key, float(value)) + else: + outstr = "%s: %s\n" % ( + key, + value, + ) + print(outstr) f.write(outstr) + # Convert the dictionary to a DataFrame (Note: it's a single row dictionary) + df = pd.DataFrame([perform_metrics]) + + # Save the DataFrame to a CSV file, formatting floats with 6 decimals + df.to_csv( + output_file_prefix + "_performance_stats.csv", index=False, float_format="%.3f" + ) + -def output_predictions_to_file(sequence_data, excludeSeqID, encoding_scheme, - probabilistic_class, encoder=None, output_file_prefix=''): +def output_predictions_to_file( + sequence_data, + excludeSeqID, + encoding_scheme, + probabilistic_class, + encoder=None, + output_file_prefix="", +): """Output sequences, their true values, and their predicted values to a file Used on the output of the test_unlabeled_data() function in the train_network module in @@ -455,10 +727,10 @@ def output_predictions_to_file(sequence_data, excludeSeqID, encoding_scheme, true_value, predicted_value, sequence_ID] excludeSeqID : bool Boolean indicating whether or not each line in `tsvfile` has a sequence ID - (default is False) + (default is False) encoding_scheme : str - Description of how an amino acid sequence should be encoded 
as a numeric - vector. Providing a string other than 'onehot', 'biophysics', or 'user' + Description of how an amino acid sequence should be encoded as a numeric + vector. Providing a string other than 'onehot', 'biophysics', or 'user' will produce unintended consequences. probabilistic_class : bool Flag indicating if probabilistic classification was specified by the user. If True, @@ -484,40 +756,46 @@ def output_predictions_to_file(sequence_data, excludeSeqID, encoding_scheme, pred_vals.append(pred_val) if excludeSeqID: - names.append('test' + str(count)) + names.append("test" + str(count)) count += 1 else: names.append(name) # Decode the sequence vectors - if encoding_scheme == 'onehot': + if encoding_scheme == "onehot": sequences = encode_sequence.rev_one_hot(seq_vectors) - elif encoding_scheme == 'biophysics': + elif encoding_scheme == "biophysics": sequences = encode_sequence.rev_biophysics(seq_vectors) else: sequences = encoder.decode(seq_vectors) # Write to file - with open(output_file_prefix + '_predictions.tsv', 'w') as tsvfile: + with open(output_file_prefix + "_predictions.tsv", "w") as tsvfile: for i in range(len(names)): # Adjust formatting for residues or sequence data if isinstance(true_vals[i], np.ndarray): - true_vals_format = ' '.join(true_vals[i].astype(str)) - pred_vals_format = ' '.join(pred_vals[i].astype(str)) + true_vals_format = " ".join(true_vals[i].astype(str)) + pred_vals_format = " ".join(pred_vals[i].astype(str)) elif probabilistic_class: true_vals_format = true_vals[i] - pred_vals_format = ' '.join(np.around(pred_vals[i], decimals=4).astype(str)) + pred_vals_format = " ".join( + np.around(pred_vals[i], decimals=4).astype(str) + ) else: true_vals_format = true_vals[i] pred_vals_format = pred_vals[i] - ''' + """ Format: NAME_TRUE SEQUENCE TRUE_VALUE(S) NAME_PRED SEQUENCE PRED_VALUE(S) - ''' + """ output_str = "%s_TRUE %s %s\n" % (names[i], sequences[i], true_vals_format) - output_str = output_str + "%s_PRED %s %s\n" % (names[i], sequences[i], pred_vals_format) + output_str = output_str + "%s_PRED %s %s\n" % ( + names[i], + sequences[i], + pred_vals_format, + ) tsvfile.write(output_str) diff --git a/parrot/train_network.py b/parrot/train_network.py index d4d939a..3a95f6f 100644 --- a/parrot/train_network.py +++ b/parrot/train_network.py @@ -8,20 +8,32 @@ Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ +import time +from datetime import timedelta +import numpy as np import torch import torch.nn as nn -from torch.utils.data import Dataset, DataLoader -import numpy as np - -from parrot import brnn_plot -from parrot import encode_sequence - -def train(network, train_loader, val_loader, datatype, problem_type, weights_file, - stop_condition, device, learn_rate, n_epochs, verbose=False, silent=False): +from parrot import brnn_plot, encode_sequence + + +def train( + network, + train_loader, + val_loader, + datatype, + problem_type, + weights_file, + stop_condition, + device, + learn_rate, + n_epochs, + verbose=False, + silent=False, +): """Train a BRNN and save the best performing network weights Train the network on a training set, and every epoch evaluate its performance on @@ -30,8 +42,8 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil User must specify the machine learning tast (`problem_type`) and the format of the data (`datatype`). 
Additionally, this function requires the learning rate - hyperparameter and the number of epochs of training. The other hyperparameters, - number of hidden layers and hidden vector size, are implictly included on the + hyperparameter and the number of epochs of training. The other hyperparameters, + number of hidden layers and hidden vector size, are implictly included on the the provided network. The user may specify if they want to train the network for a set number of @@ -64,8 +76,9 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil performance has sufficiently stagnated. If the performance plateaus for `n_epochs` consecutive epochs, then training will stop. device : str - Location of where training will take place--should be either 'cpu' or - 'cuda' (GPU). If available, training on GPU is typically much faster. + Location of where training will take place--should be 'cpu', 'mps' (Apple + GPU) or 'cuda' (GPU). If available, training on GPU is typically + much faster. learn_rate : float Initial learning rate of network training. The training process is controlled by the Adam optimization algorithm, so this learning rate @@ -87,50 +100,62 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil A list of the average validation set losses achieved at each epoch """ + print(f"Using device: {device}") + network.to(device) + # Set optimizer optimizer = torch.optim.Adam(network.parameters(), lr=learn_rate) # Set loss criteria - if problem_type == 'regression': - if datatype == 'residues': - criterion = nn.MSELoss(reduction='sum') - elif datatype == 'sequence': - criterion = nn.L1Loss(reduction='sum') - elif problem_type == 'classification': - criterion = nn.CrossEntropyLoss(reduction='sum') + if problem_type == "regression": + if datatype == "residues": + criterion = nn.MSELoss(reduction="sum") + elif datatype == "sequence": + criterion = nn.L1Loss(reduction="sum") + elif problem_type == "classification": + criterion = nn.CrossEntropyLoss(reduction="sum") network = network.float() - total_step = len(train_loader) + # total_step = len(train_loader) min_val_loss = np.inf avg_train_losses = [] avg_val_losses = [] - if stop_condition == 'auto': + if stop_condition == "auto": min_epochs = n_epochs # Set to some arbitrarily large number of iterations -- will stop automatically - n_epochs = 20000000 + # n_epochs = 20000000 + n_epochs = 2 * n_epochs last_decrease = 0 # Train the model - evaluate performance on val set every epoch end_training = False + + with open("train_network.debug.log", "w") as f: + f.write("DataType;ProblemType;Epoch;MaxEpochs;Percentage;StopCondition;StartTime;EndTime;ElapsedTime;TrainLoss;ValLoss;SignificantDecrease;LastDecreaseEpoch\n") + + # To be saved only once + best_network_dict = None + for epoch in range(n_epochs): # Main loop + start_time = time.time() # Initialize training and testing loss for epoch train_loss = 0 val_loss = 0 # Iterate over batches - for i, (names, vectors, targets) in enumerate(train_loader): + for _names, vectors, targets in train_loader: vectors = vectors.to(device) targets = targets.to(device) # Forward pass outputs = network(vectors.float()) - if problem_type == 'regression': + if problem_type == "regression": loss = criterion(outputs, targets.float()) else: - if datatype == 'residues': + if datatype == "residues": outputs = outputs.permute(0, 2, 1) loss = criterion(outputs, targets.long()) @@ -141,16 +166,16 @@ def train(network, train_loader, val_loader, datatype, problem_type, 
weights_fil loss.backward() optimizer.step() - for names, vectors, targets in val_loader: + for _names, vectors, targets in val_loader: vectors = vectors.to(device) targets = targets.to(device) # Forward pass outputs = network(vectors.float()) - if problem_type == 'regression': + if problem_type == "regression": loss = criterion(outputs, targets.float()) else: - if datatype == 'residues': + if datatype == "residues": outputs = outputs.permute(0, 2, 1) loss = criterion(outputs, targets.long()) @@ -162,12 +187,12 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil val_loss /= len(val_loader.dataset) signif_decrease = True - if stop_condition == 'auto' and epoch > min_epochs - 1: + if stop_condition == "auto" and epoch > min_epochs - 1: # Check to see if loss has stopped decreasing last_epochs_loss = avg_val_losses[-min_epochs:] for loss in last_epochs_loss: - if val_loss >= loss*0.995: + if val_loss >= loss * 0.995: signif_decrease = False # If network performance has plateaued over the last range of epochs, end training @@ -176,37 +201,70 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil # Only save updated weights to memory if they improve val set performance if val_loss < min_val_loss: - min_val_loss = val_loss # Reset min_val_loss + min_val_loss = val_loss # Reset min_val_loss last_decrease = epoch - torch.save(network.state_dict(), weights_file) # Save model + best_network_dict = network.state_dict() + # torch.save(network.state_dict(), weights_file) # Save model # Append losses to lists avg_train_losses.append(train_loss) avg_val_losses.append(val_loss) + end_time = time.time() + elapsed_time = timedelta(seconds=end_time - start_time) + + # Convert elapsed time to hours, minutes, seconds, and milliseconds + hours, remainder = divmod(elapsed_time.seconds, 3600) + minutes, seconds = divmod(remainder, 60) + milliseconds = elapsed_time.microseconds // 1000 # Convert microseconds to milliseconds + + formatted_start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time)) + formatted_end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time)) + formatted_elapsed_time = f"{hours}h, {minutes}m, {seconds}s, {milliseconds}ms" + percentage = 100*((epoch+1)/n_epochs) + + epoch_detail_line = f"{datatype};{problem_type};{epoch + 1};{n_epochs};{percentage:.4f}%;{stop_condition};{formatted_start_time};{formatted_end_time};{formatted_elapsed_time};{train_loss:.4f};{val_loss:.4f};{str(signif_decrease)};{last_decrease + 1}\n" + + with open("train_network.debug.log", "a") as f: + f.write(epoch_detail_line) + if verbose: - print('Epoch %d\tLoss %.4f' % (epoch, val_loss)) + print(epoch_detail_line) elif epoch % 5 == 0 and silent is False: - print('Epoch %d\tLoss %.4f' % (epoch, val_loss)) + print(epoch_detail_line) # This is placed here to ensure that the best network, even if the performance # improvement is marginal, is saved. 
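
The per-epoch timing and elapsed-time formatting added to the training loop can be exercised on its own; a standalone sketch with an artificial delay standing in for one epoch (values are illustrative):

import time
from datetime import timedelta

start_time = time.time()
time.sleep(0.25)                                   # stand-in for one training epoch
end_time = time.time()

elapsed = timedelta(seconds=end_time - start_time)
hours, remainder = divmod(elapsed.seconds, 3600)
minutes, seconds = divmod(remainder, 60)
milliseconds = elapsed.microseconds // 1000        # microseconds -> milliseconds
print(f"{hours}h, {minutes}m, {seconds}s, {milliseconds}ms")
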
if end_training: break + # Save model only once + if best_network_dict: + torch.save(best_network_dict, weights_file) + else: + raise "No data to save the model" + # Return loss per epoch so that they can be plotted return avg_train_losses, avg_val_losses -def test_labeled_data(network, test_loader, datatype, - problem_type, weights_file, num_classes, - probabilistic_classification, include_figs, - device, output_file_prefix=''): +def test_labeled_data( + network, + test_loader, + datatype, + problem_type, + weights_file, + num_classes, + probabilistic_classification, + include_figs, + device, + output_file_prefix="", +): """Test a trained BRNN on labeled sequences Using the saved weights of a trained network, run a set of sequences through the network and evaluate the performancd. Return the average loss per - sequence and plot the results. Testing a network on previously-unseen data + sequence and plot the results. Testing a network on previously-unseen data provides a useful estimate of how generalizeable the network's performance is. Parameters @@ -232,10 +290,11 @@ def test_labeled_data(network, test_loader, datatype, include_figs: bool Whether or not matplotlib figures should be generated. device : str - Location of where testing will take place--should be either 'cpu' or - 'cuda' (GPU). If available, training on GPU is typically much faster. + Location of where training will take place--should be 'cpu', 'mps' (Apple + GPU) or 'cuda' (GPU). If available, training on GPU is typically + much faster. output_file_prefix : str - Path and filename prefix to which the test set predictions and plots will be saved. + Path and filename prefix to which the test set predictions and plots will be saved. Returns ------- @@ -251,20 +310,20 @@ def test_labeled_data(network, test_loader, datatype, network.load_state_dict(torch.load(weights_file)) # Get output directory for images - network_filename = weights_file.split('/')[-1] - output_dir = weights_file[:-len(network_filename)] + network_filename = weights_file.split("/")[-1] + output_dir = weights_file[: -len(network_filename)] # Set loss criteria - if problem_type == 'regression': + if problem_type == "regression": criterion = nn.MSELoss() - elif problem_type == 'classification': + elif problem_type == "classification": criterion = nn.CrossEntropyLoss() test_loss = 0 all_targets = [] all_outputs = [] predictions = [] - for names, vectors, targets in test_loader: # batch size of 1 + for names, vectors, targets in test_loader: # batch size of 1 all_targets.append(targets) vectors = vectors.to(device) @@ -272,10 +331,10 @@ def test_labeled_data(network, test_loader, datatype, # Forward pass outputs = network(vectors.float()) - if problem_type == 'regression': + if problem_type == "regression": loss = criterion(outputs, targets.float()) else: - if datatype == 'residues': + if datatype == "residues": outputs = outputs.permute(0, 2, 1) loss = criterion(outputs, targets.long()) @@ -283,46 +342,60 @@ def test_labeled_data(network, test_loader, datatype, all_outputs.append(outputs.detach()) # Add to list as: [seq_vector, true value, predicted value, name] - predictions.append([vectors[0].cpu().numpy(), targets.cpu().numpy() - [0], outputs.cpu().detach().numpy(), names[0]]) + predictions.append( + [ + vectors[0].cpu().numpy(), + targets.cpu().numpy()[0], + outputs.cpu().detach().numpy(), + names[0], + ] + ) # Plot 'accuracy' depending on the problem type and datatype - if problem_type == 'regression': - if datatype == 'residues': + if problem_type == 
"regression": + if datatype == "residues": if include_figs: - brnn_plot.residue_regression_scatterplot(all_targets, all_outputs, - output_file_prefix=output_file_prefix) + brnn_plot.residue_regression_scatterplot( + all_targets, all_outputs, output_file_prefix=output_file_prefix + ) # Format predictions for i in range(len(predictions)): predictions[i][2] = predictions[i][2].flatten() predictions[i][1] = predictions[i][1].flatten() - elif datatype == 'sequence': + elif datatype == "sequence": if include_figs: - brnn_plot.sequence_regression_scatterplot(all_targets, all_outputs, - output_file_prefix=output_file_prefix) + brnn_plot.sequence_regression_scatterplot( + all_targets, all_outputs, output_file_prefix=output_file_prefix + ) # Format predictions for i in range(len(predictions)): predictions[i][2] = predictions[i][2][0][0] predictions[i][1] = predictions[i][1][0] - elif problem_type == 'classification': + elif problem_type == "classification": - if datatype == 'residues': + if datatype == "residues": if include_figs: - brnn_plot.res_confusion_matrix(all_targets, all_outputs, num_classes, - output_file_prefix=output_file_prefix) + brnn_plot.res_confusion_matrix( + all_targets, + all_outputs, + num_classes, + output_file_prefix=output_file_prefix, + ) # Format predictions and assign class predictions for i in range(len(predictions)): pred_values = [] + for j in range(len(predictions[i][2])): pred_values = np.argmax(predictions[i][2], axis=1)[0] - predictions[i][2] = np.array(pred_values, dtype=np.int) - elif datatype == 'sequence': + predictions[i][2] = np.array(pred_values, dtype=int) + + elif datatype == "sequence": if probabilistic_classification: # Probabilistic assignment of class predictions # Optional implementation for classification tasks @@ -337,10 +410,18 @@ def test_labeled_data(network, test_loader, datatype, # Plot ROC and PR curves if include_figs: - brnn_plot.plot_roc_curve(all_targets, pred_probabilities, num_classes, - output_file_prefix=output_file_prefix) - brnn_plot.plot_precision_recall_curve(all_targets, pred_probabilities, - num_classes, output_file_prefix=output_file_prefix) + brnn_plot.plot_roc_curve( + all_targets, + pred_probabilities, + num_classes, + output_file_prefix=output_file_prefix, + ) + brnn_plot.plot_precision_recall_curve( + all_targets, + pred_probabilities, + num_classes, + output_file_prefix=output_file_prefix, + ) else: # Absolute assignment of class predictions @@ -351,20 +432,31 @@ def test_labeled_data(network, test_loader, datatype, # Plot confusion matrix (if not in probabilistic classification mode) if include_figs: - brnn_plot.confusion_matrix(all_targets, all_outputs, num_classes, - output_file_prefix=output_file_prefix) + brnn_plot.confusion_matrix( + all_targets, + all_outputs, + num_classes, + output_file_prefix=output_file_prefix, + ) return test_loss / len(test_loader.dataset), predictions -def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', encoder=None, print_frequency=None): +def test_unlabeled_data( + network, + sequences, + device, + encoding_scheme="onehot", + encoder=None, + print_frequency=None, +): """Test a trained BRNN on unlabeled sequences Use a trained network to make predictions on previously-unseen data. - ** + ** Note: Unlike the previous functions, `network` here must have pre-loaded - weights. + weights. 
** Parameters @@ -374,8 +466,9 @@ def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', en sequences : list A list of amino acid sequences to test using the network device : str - Location of where testing will take place--should be either 'cpu' or - 'cuda' (GPU). If available, training on GPU is typically much faster. + Location of where training will take place--should be 'cpu', 'mps' (Apple + GPU) or 'cuda' (GPU). If available, training on GPU is typically + much faster. encoding_scheme : str, optional How amino acid sequences are to be encoded as numeric vectors. Currently, 'onehot','biophysics' and 'user' are the implemented options. @@ -386,7 +479,7 @@ def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', en print_frequency : int If provided defines at what sequence interval an update is printed. Default = None. - + Returns ------- dict @@ -403,13 +496,13 @@ def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', en local_count = local_count + 1 if print_frequency is not None: if local_count % print_frequency == 0: - print(f'On {local_count} of {total_count}') + print(f"On {local_count} of {total_count}") - if encoding_scheme == 'onehot': + if encoding_scheme == "onehot": seq_vector = encode_sequence.one_hot(seq) - elif encoding_scheme == 'biophysics': + elif encoding_scheme == "biophysics": seq_vector = encode_sequence.biophysics(seq) - elif encoding_scheme == 'user': + elif encoding_scheme == "user": seq_vector = encoder.encode(seq) seq_vector = seq_vector.view(1, len(seq_vector), -1) diff --git a/scripts/parrot-optimize b/scripts/parrot-optimize index 22a8365..f091951 100755 --- a/scripts/parrot-optimize +++ b/scripts/parrot-optimize @@ -1,8 +1,8 @@ #!/usr/bin/env python """ Usage: $ parrot-optimize data_file output_network - -Driver script for finding optimal hyperparameters for a bidirectional recurrent + +Driver script for finding optimal hyperparameters for a bidirectional recurrent neural network on a given dataset, then training a network with those parameters For more information on usage, use the '-h' flag. @@ -13,7 +13,7 @@ idptools-parrot was developed by the Holehouse lab Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ import os @@ -34,63 +34,148 @@ from parrot.tools import validate_args from parrot.tools import dataset_warnings # Parse the command line arguments -parser = argparse.ArgumentParser(description='Train and test a bi-directional RNN using entire sequence.') - -parser.add_argument('data_file', help='path to tsv file with format: ') - -parser.add_argument('output_network', help='location to save the trained network') - -parser.add_argument('-d', '--datatype', metavar='dtype', type=str, required=True, - help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'") - -parser.add_argument('-c', '--classes', type=int, metavar='num_classes', required=True, - help='REQUIRED. 
Number of output classes, for regression put 1') - -parser.add_argument('-b', '--batch', default=32, type=int, metavar='batch_size', - help='size of training batch (def=32)') - -parser.add_argument('-e', '--epochs', default=100, type=int, metavar='num_epochs', - help='number of training epochs (def=100)') - -parser.add_argument('--max-iter', default=50, type=int, metavar='max_iter', - help='Maximum number of iterations for the optimization procedure (def=50)') - -parser.add_argument('--split', default='', metavar='split_file', type=str, - help="file indicating how to split datafile into training, validation, and testing sets") - -parser.add_argument('--set-fractions', nargs=3, default=[0.7, 0.15, 0.15], type=float, - dest='setFractions', metavar=('train', 'val', 'test'), - help='Proportion of dataset that should be divided into training, validation, and test sets') - -parser.add_argument('--encode', default='onehot', type=str, metavar='encoding_scheme', - help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme") - -parser.add_argument('--exclude-seq-id', dest='excludeSeqID', action='store_true', - help='use if data_file lacks sequence IDs in the first column of each line') - -parser.add_argument('--probabilistic-classification', dest='probabilistic_classification', - action='store_true', help='Optional implementation for sequence classificaion') - -parser.add_argument('--include-figs', dest='include_figs', action='store_true', - help='Generate figures from training results and save to same location as network') - -parser.add_argument('--no-stats', dest='ignore_metrics', action='store_true', - help='If passed, do not output a perfomance stats file.') - -parser.add_argument('--force-cpu', dest='forceCPU', action='store_true', - help='force network to train on CPU, even if GPU is available') - -parser.add_argument('--ignore-warnings', '-w', dest='ignore_warnings', action='store_true', - help='Do not display warnings for dataset structure') - -parser.add_argument('--save-splits', dest='save_splits', action='store_true', - help='Save a split-file using the random splits from this run') - -parser.add_argument('--verbose', '-v', action='store_true', - help='Flag which, if provided, causes output to terminal to be more descriptive') - -parser.add_argument('--silent', action='store_true', - help="Flag which, if provided, ensures no output is generated to the terminal") +parser = argparse.ArgumentParser( + description="Train and test a bi-directional RNN using entire sequence." +) + +parser.add_argument( + "data_file", help="path to tsv file with format: " +) + +parser.add_argument("output_network", help="location to save the trained network") + +parser.add_argument( + "-d", + "--datatype", + metavar="dtype", + type=str, + required=True, + help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'", +) + +parser.add_argument( + "-c", + "--classes", + type=int, + metavar="num_classes", + required=True, + help="REQUIRED. 
Number of output classes, for regression put 1", +) + +parser.add_argument( + "-b", + "--batch", + default=32, + type=int, + metavar="batch_size", + help="size of training batch (def=32)", +) + +parser.add_argument( + "-e", + "--epochs", + default=100, + type=int, + metavar="num_epochs", + help="number of training epochs (def=100)", +) + +parser.add_argument( + "--max-iter", + default=50, + type=int, + metavar="max_iter", + help="Maximum number of iterations for the optimization procedure (def=50)", +) + +parser.add_argument( + "--split", + default="", + metavar="split_file", + type=str, + help="file indicating how to split datafile into training, validation, and testing sets", +) + +parser.add_argument( + "--set-fractions", + nargs=3, + default=[0.7, 0.15, 0.15], + type=float, + dest="setFractions", + metavar=("train", "val", "test"), + help="Proportion of dataset that should be divided into training, validation, and test sets", +) + +parser.add_argument( + "--encode", + default="onehot", + type=str, + metavar="encoding_scheme", + help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme", +) + +parser.add_argument( + "--exclude-seq-id", + dest="excludeSeqID", + action="store_true", + help="use if data_file lacks sequence IDs in the first column of each line", +) + +parser.add_argument( + "--probabilistic-classification", + dest="probabilistic_classification", + action="store_true", + help="Optional implementation for sequence classificaion", +) + +parser.add_argument( + "--include-figs", + dest="include_figs", + action="store_true", + help="Generate figures from training results and save to same location as network", +) + +parser.add_argument( + "--no-stats", + dest="ignore_metrics", + action="store_true", + help="If passed, do not output a perfomance stats file.", +) + +parser.add_argument( + "--force-cpu", + dest="forceCPU", + action="store_true", + help="force network to train on CPU, even if GPU is available", +) + +parser.add_argument( + "--ignore-warnings", + "-w", + dest="ignore_warnings", + action="store_true", + help="Do not display warnings for dataset structure", +) + +parser.add_argument( + "--save-splits", + dest="save_splits", + action="store_true", + help="Save a split-file using the random splits from this run", +) + +parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Flag which, if provided, causes output to terminal to be more descriptive", +) + +parser.add_argument( + "--silent", + action="store_true", + help="Flag which, if provided, ensures no output is generated to the terminal", +) args = parser.parse_args() @@ -119,29 +204,42 @@ save_splits = args.save_splits # Device configuration if forceCPU: - device = 'cpu' + device = "cpu" + device_string = "cpu" else: - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + if torch.cuda.is_available(): + device_string = "cuda" + device = torch.device(device_string) + elif torch.backends.mps.is_available() and torch.backends.mps.is_built(): + # Use MPS if available on ARM-based MacBooks + device_string = "mps" + device = torch.device(device_string) + else: + device_string = "cpu" + device = torch.device(device_string) + +if verbose: + print(f"Torch device={device_string}") ############################################################################### ################ Validate arguments and initialize: ################### # Ensure that provided data file exists -data_file = validate_args.check_file_exists(args.data_file, 'Datafile') +data_file = 
validate_args.check_file_exists(args.data_file, "Datafile") # Extract output directory and output prediction file name network_file = os.path.abspath(args.output_network) filename_prefix, output_dir = validate_args.split_file_and_directory(network_file) # If provided, check that split_file exists -if split_file != '': - split_file = validate_args.check_file_exists(split_file, 'Split-file') +if split_file != "": + split_file = validate_args.check_file_exists(split_file, "Split-file") else: split_file = None # If specified, get location where randomly generated train/val/test splits will be saved if save_splits: - save_splits_output = filename_prefix + '_split_file.txt' + save_splits_output = filename_prefix + "_split_file.txt" else: save_splits_output = None @@ -152,20 +250,22 @@ encoding_scheme, encoder, input_size = validate_args.set_encoding_scheme(encode) problem_type, collate_function = validate_args.set_ml_task(num_classes, dtype) # Ensure that network hyperparams (not being optimized) are valid -validate_args.check_positive(num_epochs, 'Number of epochs') -validate_args.check_positive(batch_size, 'Batch size') +validate_args.check_positive(num_epochs, "Number of epochs") +validate_args.check_positive(batch_size, "Batch size") # Ensure that the sum of setFractions adds up to 1 for frac in setFractions: - validate_args.check_between_zero_and_one(frac, 'Set fractions') + validate_args.check_between_zero_and_one(frac, "Set fractions") if sum(setFractions) != 1.0: - raise ValueError('Set fractions must sum to 1.') + raise ValueError("Set fractions must sum to 1.") # Ensure that task is binary sequence classification if # probabilistic_classfication is set if probabilistic_classification: - if dtype != 'sequence' or num_classes < 2: - raise ValueError('Probabilistic classification only implemented for sequence classification') + if dtype != "sequence" or num_classes < 2: + raise ValueError( + "Probabilistic classification only implemented for sequence classification" + ) # Set ignore_warnings to True if --silent is provided if silent: @@ -175,12 +275,20 @@ if silent: ################################ Main code ################################## # Split data -cvs, train, val, test = pid.split_data_cv(data_file, datatype=dtype, problem_type=problem_type, - num_classes=num_classes, excludeSeqID=excludeSeqID, - split_file=split_file, encoding_scheme=encoding_scheme, - encoder=encoder, ignoreWarnings=ignore_warnings, - percent_val=setFractions[1], percent_test=setFractions[2], - save_splits_output=save_splits_output) +cvs, train, val, test = pid.split_data_cv( + data_file, + datatype=dtype, + problem_type=problem_type, + num_classes=num_classes, + excludeSeqID=excludeSeqID, + split_file=split_file, + encoding_scheme=encoding_scheme, + encoder=encoder, + ignoreWarnings=ignore_warnings, + percent_val=setFractions[1], + percent_test=setFractions[2], + save_splits_output=save_splits_output, +) # Assess batch size compared to training set size if not ignore_warnings: @@ -189,10 +297,18 @@ if not ignore_warnings: # Convert CV datasets to dataloaders cv_loaders = [] for cv_train, cv_val in cvs: - cv_train_loader = torch.utils.data.DataLoader(dataset=cv_train, batch_size=batch_size, - collate_fn=collate_function, shuffle=True) - cv_val_loader = torch.utils.data.DataLoader(dataset=cv_val, batch_size=batch_size, - collate_fn=collate_function, shuffle=False) + cv_train_loader = torch.utils.data.DataLoader( + dataset=cv_train, + batch_size=batch_size, + collate_fn=collate_function, + shuffle=True, + ) + 
cv_val_loader = torch.utils.data.DataLoader( + dataset=cv_val, + batch_size=batch_size, + collate_fn=collate_function, + shuffle=False, + ) cv_loaders.append((cv_train_loader, cv_val_loader)) # Output to std out @@ -202,7 +318,7 @@ if silent is False: print("PARROT with hyperparameter optimization") print("---------------------------------------") if verbose: - print('Train on:\t%s' % device) + print("Train on:\t%s" % device) print("Datatype:\t%s" % dtype) print("ML Task:\t%s" % problem_type) print("Batch size:\t%d" % batch_size) @@ -210,70 +326,109 @@ if silent is False: print("Number of optimization iterations:\t%d\n" % max_iterations) # Optimization procedure -optimizer = bayesian_optimization.BayesianOptimizer(cv_loaders, input_size, num_epochs, - num_classes, dtype, network_file, - max_iterations, device, silent) +optimizer = bayesian_optimization.BayesianOptimizer( + cv_loaders, + input_size, + num_epochs, + num_classes, + dtype, + network_file, + max_iterations, + device, + silent, +) best_hyperparams = optimizer.optimize() -lr = 10**best_hyperparams[0] +lr = 10 ** best_hyperparams[0] nl = int(best_hyperparams[1]) hs = int(best_hyperparams[2]) # Save these hyperparamters to a file so that the user has a record # TODO: move to helper function -params_file = filename_prefix + '_optimal_hyperparams.txt' -with open(params_file, 'w') as f: - f.write('Learning rate: %.5f\n' % lr) - f.write('Num Layers: %d\n' % nl) - f.write('Hidden vector size: %d\n' % hs) +params_file = filename_prefix + "_optimal_hyperparams.txt" +with open(params_file, "w") as f: + f.write("Learning rate: %.5f\n" % lr) + f.write("Num Layers: %d\n" % nl) + f.write("Hidden vector size: %d\n" % hs) # Use these best hyperparams to train the network from scratch using the entire train/val sets # Add data to dataloaders -train_loader = torch.utils.data.DataLoader(dataset=train, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=True) -val_loader = torch.utils.data.DataLoader(dataset=val, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=False) -test_loader = torch.utils.data.DataLoader(dataset=test, - batch_size=1, # Set test batch size to 1 - collate_fn=collate_function, - shuffle=False) +train_loader = torch.utils.data.DataLoader( + dataset=train, batch_size=batch_size, collate_fn=collate_function, shuffle=True +) +val_loader = torch.utils.data.DataLoader( + dataset=val, batch_size=batch_size, collate_fn=collate_function, shuffle=False +) +test_loader = torch.utils.data.DataLoader( + dataset=test, + batch_size=1, # Set test batch size to 1 + collate_fn=collate_function, + shuffle=False, +) # Initialize network: -if dtype == 'sequence': - brnn_network = brnn_architecture.BRNN_MtO(input_size, hs, nl, num_classes, device).to(device) +if dtype == "sequence": + brnn_network = brnn_architecture.BRNN_MtO( + input_size, hs, nl, num_classes, device + ).to(device) else: # dtype == 'residues' - brnn_network = brnn_architecture.BRNN_MtM(input_size, hs, nl, num_classes, device).to(device) + brnn_network = brnn_architecture.BRNN_MtM( + input_size, hs, nl, num_classes, device + ).to(device) # Train network if silent is False: - print('Training with optimal hyperparams:') -train_loss, val_loss = train_network.train(brnn_network, train_loader, val_loader, datatype=dtype, - problem_type=problem_type, weights_file=network_file, - stop_condition='iter', device=device, learn_rate=lr, - n_epochs=num_epochs*2, verbose=verbose, silent=silent) + print("Training with optimal hyperparams:") +train_loss, val_loss = 
train_network.train( + brnn_network, + train_loader, + val_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + stop_condition="iter", + device=device, + learn_rate=lr, + n_epochs=num_epochs * 2, + verbose=verbose, + silent=silent, +) if include_figs: # Plot training & validation loss per epoch brnn_plot.training_loss(train_loss, val_loss, output_file_prefix=filename_prefix) # Test network -test_loss, test_set_predictions = train_network.test_labeled_data(brnn_network, test_loader, - datatype=dtype, problem_type=problem_type, - weights_file=network_file, num_classes=num_classes, - probabilistic_classification=probabilistic_classification, - include_figs=include_figs, device=device, - output_file_prefix=filename_prefix) +test_loss, test_set_predictions = train_network.test_labeled_data( + brnn_network, + test_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + num_classes=num_classes, + probabilistic_classification=probabilistic_classification, + include_figs=include_figs, + device=device, + output_file_prefix=filename_prefix, +) if silent is False: - print('\nTest Loss: %.4f' % test_loss) + print("\nTest Loss: %.4f" % test_loss) # Output performance metrics if not ignore_metrics: - brnn_plot.write_performance_metrics(test_set_predictions, dtype, problem_type, - probabilistic_classification, filename_prefix) - + brnn_plot.write_performance_metrics( + test_set_predictions, + dtype, + problem_type, + probabilistic_classification, + filename_prefix, + ) + # Output the test set predictions to a text file -brnn_plot.output_predictions_to_file(test_set_predictions, excludeSeqID, encoding_scheme, - probabilistic_classification, encoder, output_file_prefix=filename_prefix) +brnn_plot.output_predictions_to_file( + test_set_predictions, + excludeSeqID, + encoding_scheme, + probabilistic_classification, + encoder, + output_file_prefix=filename_prefix, +) diff --git a/scripts/parrot-train b/scripts/parrot-train index 39ae2b9..39dce11 100755 --- a/scripts/parrot-train +++ b/scripts/parrot-train @@ -1,7 +1,7 @@ #!/usr/bin/env python """ Usage: $ parrot-train data_file output_network - + Driver script for training a bidirectional recurrent neural network with user specified parameters. For more information on usage, use the '-h' flag. @@ -12,7 +12,7 @@ idptools-parrot was developed by the Holehouse lab Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ import os @@ -32,75 +32,182 @@ from parrot.tools import validate_args from parrot.tools import dataset_warnings # Parse the command line arguments -parser = argparse.ArgumentParser(description='Train and test a bi-directional RNN using entire sequence.') - -parser.add_argument('data_file', help='path to tsv file with format: ') - -parser.add_argument('output_network', help='location to save the trained network') - -parser.add_argument('-d', '--datatype', metavar='dtype', type=str, required=True, - help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'") - -parser.add_argument('-c', '--classes', type=int, metavar='num_classes', required=True, - help='REQUIRED. 
Number of output classes, for regression put 1') - -parser.add_argument('-hs', '--hidden-size', default=10, type=int, metavar='hidden_size', - help='hidden vector size (def=10)') - -parser.add_argument('-nl', '--num-layers', default=1, type=int, metavar='num_layers', - help='number of layers per direction (def=1)') - -parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, - metavar='learning_rate', help='(def=0.001)') - -parser.add_argument('-b', '--batch', default=32, type=int, metavar='batch_size', - help='size of training batch (def=32)') - -parser.add_argument('-e', '--epochs', default=100, type=int, metavar='num_epochs', - help='number of training epochs (def=100)') - -parser.add_argument('--split', default='', metavar='split_file', type=str, - help="file indicating how to split datafile into training, validation, and test sets") - -parser.add_argument('--stop', default='iter', metavar='stop_condition', - type=str, help="training stop condition: either 'auto' or 'iter' (default 'iter')") - -parser.add_argument('--set-fractions', nargs=3, default=[0.7, 0.15, 0.15], type=float, - dest='setFractions', metavar=('train', 'val', 'test'), - help='proportion of dataset that should be divided into training, validation, and test sets') - -parser.add_argument('--encode', default='onehot', type=str, metavar='encoding_scheme', - help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme") - -parser.add_argument('--exclude-seq-id', dest='excludeSeqID', action='store_true', - help='use if data_file lacks sequence IDs in the first column of each line') - -parser.add_argument('--probabilistic-classification', dest='probabilistic_classification', - action='store_true', help='Optional implementation for sequence classificaion') - -parser.add_argument('--include-figs', dest='include_figs', action='store_true', - help='Generate figures from training results and save to same location as network') - -parser.add_argument('--no-stats', dest='ignore_metrics', action='store_true', - help='If passed, do not output a performance stats file.') - -parser.add_argument('--force-cpu', dest='forceCPU', action='store_true', - help='force network to train on CPU, even if GPU is available') - -parser.add_argument('--gpu-id', dest='gpu_id', type=int, - help='User defined control over which CUDA device will be used by parrot') - -parser.add_argument('--ignore-warnings', '-w', dest='ignore_warnings', action='store_true', - help='Do not display warnings for dataset structure') - -parser.add_argument('--save-splits', dest='save_splits', action='store_true', - help='Save a split-file using the random splits from this run') - -parser.add_argument('--verbose', '-v', action='store_true', - help='Flag which, if provided, causes output to terminal to be more descriptive') - -parser.add_argument('--silent', action='store_true', - help="Flag which, if provided, ensures no output is generated to the terminal") +parser = argparse.ArgumentParser( + description="Train and test a bi-directional RNN using entire sequence." +) + +parser.add_argument( + "data_file", help="path to tsv file with format: " +) + +parser.add_argument("output_network", help="location to save the trained network") + +parser.add_argument( + "-d", + "--datatype", + metavar="dtype", + type=str, + required=True, + help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'", +) + +parser.add_argument( + "-c", + "--classes", + type=int, + metavar="num_classes", + required=True, + help="REQUIRED. 
Number of output classes, for regression put 1", +) + +parser.add_argument( + "-hs", + "--hidden-size", + default=10, + type=int, + metavar="hidden_size", + help="hidden vector size (def=10)", +) + +parser.add_argument( + "-nl", + "--num-layers", + default=1, + type=int, + metavar="num_layers", + help="number of layers per direction (def=1)", +) + +parser.add_argument( + "-lr", + "--learning-rate", + default=0.001, + type=float, + metavar="learning_rate", + help="(def=0.001)", +) + +parser.add_argument( + "-b", + "--batch", + default=32, + type=int, + metavar="batch_size", + help="size of training batch (def=32)", +) + +parser.add_argument( + "-e", + "--epochs", + default=100, + type=int, + metavar="num_epochs", + help="number of training epochs (def=100)", +) + +parser.add_argument( + "--split", + default="", + metavar="split_file", + type=str, + help="file indicating how to split datafile into training, validation, and test sets", +) + +parser.add_argument( + "--stop", + default="iter", + metavar="stop_condition", + type=str, + help="training stop condition: either 'auto' or 'iter' (default 'iter')", +) + +parser.add_argument( + "--set-fractions", + nargs=3, + default=[0.7, 0.15, 0.15], + type=float, + dest="setFractions", + metavar=("train", "val", "test"), + help="proportion of dataset that should be divided into training, validation, and test sets", +) + +parser.add_argument( + "--encode", + default="onehot", + type=str, + metavar="encoding_scheme", + help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme", +) + +parser.add_argument( + "--exclude-seq-id", + dest="excludeSeqID", + action="store_true", + help="use if data_file lacks sequence IDs in the first column of each line", +) + +parser.add_argument( + "--probabilistic-classification", + dest="probabilistic_classification", + action="store_true", + help="Optional implementation for sequence classificaion", +) + +parser.add_argument( + "--include-figs", + dest="include_figs", + action="store_true", + help="Generate figures from training results and save to same location as network", +) + +parser.add_argument( + "--no-stats", + dest="ignore_metrics", + action="store_true", + help="If passed, do not output a performance stats file.", +) + +parser.add_argument( + "--force-cpu", + dest="forceCPU", + action="store_true", + help="force network to train on CPU, even if GPU is available", +) + +parser.add_argument( + "--gpu-id", + dest="gpu_id", + type=int, + help="User defined control over which CUDA device will be used by parrot", +) + +parser.add_argument( + "--ignore-warnings", + "-w", + dest="ignore_warnings", + action="store_true", + help="Do not display warnings for dataset structure", +) + +parser.add_argument( + "--save-splits", + dest="save_splits", + action="store_true", + help="Save a split-file using the random splits from this run", +) + +parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Flag which, if provided, causes output to terminal to be more descriptive", +) + +parser.add_argument( + "--silent", + action="store_true", + help="Flag which, if provided, ensures no output is generated to the terminal", +) args = parser.parse_args() print(args) @@ -134,32 +241,48 @@ save_splits = args.save_splits # Device configuration if forceCPU: - device = 'cpu' + device = "cpu" + device_string = "cpu" elif gpu_id: - device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else 'cpu') - print(f"You've specified to run this network on cuda:{gpu_id}. 
Running on {device=}") + device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu") + device_string = "cuda" + print( + f"You've specified to run this network on cuda:{gpu_id}. Running on {device=}" + ) else: - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + if torch.cuda.is_available(): + device_string = "cuda" + device = torch.device(device_string) + elif torch.backends.mps.is_available() and torch.backends.mps.is_built(): + # Use MPS if available on ARM-based MacBooks + device_string = "mps" + device = torch.device(device_string) + else: + device_string = "cpu" + device = torch.device(device_string) + +if verbose: + print(f"Torch device={device_string}") ############################################################################### ############# Validate arguments and initialize network: ############## # Ensure that provided data file exists -data_file = validate_args.check_file_exists(args.data_file, 'Datafile') +data_file = validate_args.check_file_exists(args.data_file, "Datafile") # Extract output directory and output prediction file name network_file = os.path.abspath(args.output_network) filename_prefix, output_dir = validate_args.split_file_and_directory(network_file) # If provided, check that split_file exists -if split_file != '': - split_file = validate_args.check_file_exists(split_file, 'Split-file') +if split_file != "": + split_file = validate_args.check_file_exists(split_file, "Split-file") else: split_file = None # If specified, get location where randomly generated train/val/test splits will be saved if save_splits: - save_splits_output = filename_prefix + '_split_file.txt' + save_splits_output = filename_prefix + "_split_file.txt" else: save_splits_output = None @@ -170,69 +293,81 @@ encoding_scheme, encoder, input_size = validate_args.set_encoding_scheme(encode) problem_type, collate_function = validate_args.set_ml_task(num_classes, dtype) # Ensure that network hyperparams are valid -validate_args.check_between_zero_and_one(learning_rate, 'Learning rate') -validate_args.check_positive(hidden_size, 'Hidden vector size') -validate_args.check_positive(num_layers, 'Number of layers') -validate_args.check_positive(num_epochs, 'Number of epochs') -validate_args.check_positive(batch_size, 'Batch size') +validate_args.check_between_zero_and_one(learning_rate, "Learning rate") +validate_args.check_positive(hidden_size, "Hidden vector size") +validate_args.check_positive(num_layers, "Number of layers") +validate_args.check_positive(num_epochs, "Number of epochs") +validate_args.check_positive(batch_size, "Batch size") # Ensure that stop condition is 'iter' or 'auto' validate_args.check_stop_condition(stop_cond, num_epochs) # Ensure that the sum of setFractions adds up to 1 for frac in setFractions: - validate_args.check_between_zero_and_one(frac, 'Set fractions') + validate_args.check_between_zero_and_one(frac, "Set fractions") if sum(setFractions) != 1.0: - raise ValueError('Set fractions must sum to 1.') + raise ValueError("Set fractions must sum to 1.") # Ensure that task is binary sequence classification if # probabilistic_classfication is set if probabilistic_classification: - if dtype != 'sequence' or num_classes < 2: - raise ValueError('Probabilistic classification only implemented for sequence classification') + if dtype != "sequence" or num_classes < 2: + raise ValueError( + "Probabilistic classification only implemented for sequence classification" + ) # Set ignore_warnings to True if --silent is provided if silent: 
ignore_warnings = True # Initialize network architecture depending on data format -if dtype == 'sequence': +if dtype == "sequence": # Use a many-to-one architecture - brnn_network = brnn_architecture.BRNN_MtO(input_size, hidden_size, - num_layers, num_classes, device).to(device) -elif dtype == 'residues': + brnn_network = brnn_architecture.BRNN_MtO( + input_size, hidden_size, num_layers, num_classes, device + ).to(device) +elif dtype == "residues": # Use a many-to-many architecture - brnn_network = brnn_architecture.BRNN_MtM(input_size, hidden_size, - num_layers, num_classes, device).to(device) + brnn_network = brnn_architecture.BRNN_MtM( + input_size, hidden_size, num_layers, num_classes, device + ).to(device) ############################################################################### ################################ Main code ################################## # Split data -train, val, test = pid.split_data(data_file, datatype=dtype, problem_type=problem_type, - num_classes=num_classes, excludeSeqID=excludeSeqID, - split_file=split_file, encoding_scheme=encoding_scheme, - encoder=encoder, percent_val=setFractions[1], - percent_test=setFractions[2], ignoreWarnings=ignore_warnings, - save_splits_output=save_splits_output) +train, val, test = pid.split_data( + data_file, + datatype=dtype, + problem_type=problem_type, + num_classes=num_classes, + excludeSeqID=excludeSeqID, + split_file=split_file, + encoding_scheme=encoding_scheme, + encoder=encoder, + percent_val=setFractions[1], + percent_test=setFractions[2], + ignoreWarnings=ignore_warnings, + save_splits_output=save_splits_output, +) # Assess batch size compared to training set size if not ignore_warnings: dataset_warnings.eval_batch_size(batch_size, len(train)) # Add data to dataloaders -train_loader = torch.utils.data.DataLoader(dataset=train, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=True) -val_loader = torch.utils.data.DataLoader(dataset=val, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=False) -test_loader = torch.utils.data.DataLoader(dataset=test, - batch_size=1, # Set test batch size to 1 - collate_fn=collate_function, - shuffle=False) +train_loader = torch.utils.data.DataLoader( + dataset=train, batch_size=batch_size, collate_fn=collate_function, shuffle=True +) +val_loader = torch.utils.data.DataLoader( + dataset=val, batch_size=batch_size, collate_fn=collate_function, shuffle=False +) +test_loader = torch.utils.data.DataLoader( + dataset=test, + batch_size=1, # Set test batch size to 1 + collate_fn=collate_function, + shuffle=False, +) # Output to std out # TODO: move to helper function in /tools @@ -241,7 +376,7 @@ if silent is False: print("PARROT with user-specified parameters") print("-------------------------------------") if verbose > 1: - print('Train on:\t%s' % device) + print("Train on:\t%s" % device) print("Datatype:\t%s" % dtype) print("ML Task:\t%s" % problem_type) print("Learning rate:\t%f" % learning_rate) @@ -252,31 +387,57 @@ if silent is False: print("Validation set loss per epoch:") # Train network -train_loss, val_loss = train_network.train(brnn_network, train_loader, val_loader, datatype=dtype, - problem_type=problem_type, weights_file=network_file, - stop_condition=stop_cond, device=device, - learn_rate=learning_rate, n_epochs=num_epochs, - verbose=verbose, silent=silent) +train_loss, val_loss = train_network.train( + brnn_network, + train_loader, + val_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + 
stop_condition=stop_cond, + device=device, + learn_rate=learning_rate, + n_epochs=num_epochs, + verbose=verbose, + silent=silent, +) if include_figs: # Plot training & validation loss per epoch brnn_plot.training_loss(train_loss, val_loss, output_file_prefix=filename_prefix) # Test network -test_loss, test_set_predictions = train_network.test_labeled_data(brnn_network, test_loader, - datatype=dtype, problem_type=problem_type, - weights_file=network_file, num_classes=num_classes, - probabilistic_classification=probabilistic_classification, - include_figs=include_figs, device=device, - output_file_prefix=filename_prefix) +test_loss, test_set_predictions = train_network.test_labeled_data( + brnn_network, + test_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + num_classes=num_classes, + probabilistic_classification=probabilistic_classification, + include_figs=include_figs, + device=device, + output_file_prefix=filename_prefix, +) if silent is False: - print('\nTest Loss: %.4f' % test_loss) + print("\nTest Loss: %.4f" % test_loss) # Output performance metrics if not ignore_metrics: - brnn_plot.write_performance_metrics(test_set_predictions, dtype, problem_type, - probabilistic_classification, filename_prefix) + brnn_plot.write_performance_metrics( + test_set_predictions, + dtype, + problem_type, + probabilistic_classification, + filename_prefix, + ) # Output the test set predictions to a text file -brnn_plot.output_predictions_to_file(test_set_predictions, excludeSeqID, encoding_scheme, - probabilistic_classification, encoder, output_file_prefix=filename_prefix) +brnn_plot.output_predictions_to_file( + test_set_predictions, + excludeSeqID, + encoding_scheme, + probabilistic_classification, + encoder, + output_file_prefix=filename_prefix, +)
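
For reference, both parrot-optimize and parrot-train now repeat the same cuda/mps/cpu selection logic. The sketch below is a minimal, illustrative way that logic could be factored into a single helper; it assumes a PyTorch build recent enough to expose torch.backends.mps (>= 1.12), and the function name select_device is hypothetical -- it does not exist in PARROT.

import torch


def select_device(force_cpu=False, gpu_id=None):
    """Pick a torch.device, preferring CUDA, then Apple MPS, then CPU.

    Mirrors the device-selection blocks added to parrot-optimize and
    parrot-train in this diff; returns both the device and its name.
    """
    if force_cpu:
        return torch.device("cpu"), "cpu"
    if torch.cuda.is_available():
        # Honor an explicit CUDA device index when one is requested
        if gpu_id is not None:
            return torch.device(f"cuda:{gpu_id}"), "cuda"
        return torch.device("cuda"), "cuda"
    if torch.backends.mps.is_available() and torch.backends.mps.is_built():
        # MPS backend on ARM-based (Apple silicon) Macs
        return torch.device("mps"), "mps"
    return torch.device("cpu"), "cpu"


# Example usage (hypothetical; mirrors the scripts' flag names):
# device, device_string = select_device(force_cpu=args.forceCPU, gpu_id=getattr(args, "gpu_id", None))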