From 1ca8c67cf8a1b2dd566481ffcb42362b82a6fd34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Diaz?=
Date: Thu, 22 Aug 2024 15:02:16 +0200
Subject: [PATCH] Add MPS support

Allow PARROT to train and evaluate on Apple-silicon GPUs by selecting
PyTorch's MPS backend when CUDA is unavailable. Device selection in
parrot-train and parrot-optimize now falls back from CUDA to MPS to CPU,
docstrings document the new 'mps' device option, and the touched files
are reformatted (black-style quoting and wrapping).

---
 parrot/bayesian_optimization.py | 164 +++++++++----
 parrot/brnn_architecture.py     |  64 ++---
 parrot/train_network.py         | 202 +++++++++------
 scripts/parrot-optimize         | 403 ++++++++++++++++++++----------
 scripts/parrot-train            | 419 ++++++++++++++++++++++----------
 5 files changed, 844 insertions(+), 408 deletions(-)

diff --git a/parrot/bayesian_optimization.py b/parrot/bayesian_optimization.py
index 1243784..951926e 100644
--- a/parrot/bayesian_optimization.py
+++ b/parrot/bayesian_optimization.py
@@ -8,11 +8,9 @@ Question/comments/concerns? Raise an issue on github:
 https://github.com/idptools/parrot
 
-Licensed under the MIT license. 
+Licensed under the MIT license.
 """
 
-import math
-
 import numpy as np
 
 try:
@@ -20,18 +18,19 @@ import GPyOpt
     from GPyOpt.methods import BayesianOptimization
 except ImportError:
-    print('Error importing GPy.')
-    print('  If trying to run parrot-optimize, make sure to use `pip install idptools-parrot[optimize]`')
+    print("Error importing GPy.")
+    print(
+        "  If trying to run parrot-optimize, make sure to use `pip install idptools-parrot[optimize]`"
+    )
 
-from parrot import train_network
-from parrot import brnn_architecture
+from parrot import brnn_architecture, train_network
 
 
 class BayesianOptimizer(object):
     """A class for conducting Bayesian Optimization on a PyTorch RNN
 
     Sets up and runs GPy Bayesian Optimization in order to choose the best-
-    performing hyperparameters for a RNN for a given machine learning task. 
+    performing hyperparameters for a RNN for a given machine learning task.
     Iteratively change learning rate, hidden vector size, and the number of
     layers in the network, then train and validating using 5-fold cross
     validation.
@@ -55,7 +54,7 @@ class BayesianOptimizer(object): weights_file : str Path to which the network weights will be saved during training device : str - 'cpu' or 'cuda' depending on system hardware + 'cpu', 'mps' or 'cuda' depending on system hardware max_iterations : int Maximum number of iterations to perform the optimization procedure silent : bool @@ -64,8 +63,18 @@ class BayesianOptimizer(object): GPy-compatible bounds for each of the hyperparameters to be optimized """ - def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, - dtype, weights_file, max_iterations, device, silent): + def __init__( + self, + cv_dataloaders, + input_size, + n_epochs, + n_classes, + dtype, + weights_file, + max_iterations, + device, + silent, + ): """ Parameters ---------- @@ -83,7 +92,7 @@ def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, weights_file : str Path to which the network weights will be saved during training device : str - 'cpu' or 'cuda' depending on system hardware + 'cpu', 'mps' or 'cuda' depending on system hardware max_iterations : int Maximum number of iterations to perform the optimization procedure silent : bool @@ -96,9 +105,9 @@ def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, self.n_folds = len(cv_dataloaders) self.n_classes = n_classes if n_classes > 1: - self.problem_type = 'classification' + self.problem_type = "classification" else: - self.problem_type = 'regression' + self.problem_type = "regression" self.dtype = dtype self.weights_file = weights_file @@ -106,9 +115,19 @@ def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, self.device = device self.silent = silent - self.bds = [{'name': 'log_learning_rate', 'type': 'continuous', 'domain': (-5, -2)}, # 0.00001-0.01 - {'name': 'n_layers', 'type': 'discrete', 'domain': tuple(range(1, 6))}, # 1-5 - {'name': 'hidden_size', 'type': 'discrete', 'domain': tuple(range(5, 51))}] # 5-50 + self.bds = [ + { + "name": "log_learning_rate", + "type": "continuous", + "domain": (-5, -2), + }, # 0.00001-0.01 + { + "name": "n_layers", + "type": "discrete", + "domain": tuple(range(1, 6)), + }, # 1-5 + {"name": "hidden_size", "type": "discrete", "domain": tuple(range(5, 51))}, + ] # 5-50 def compute_cv_loss(self, hyperparameters): """Compute the average cross-val loss for a given set of hyperparameters @@ -125,7 +144,7 @@ def compute_cv_loss(self, hyperparameters): Returns ------- numpy float array - a Nx1 numpy array of the average cross-val loss + a Nx1 numpy array of the average cross-val loss per set of input hyperparameters """ @@ -134,7 +153,7 @@ def compute_cv_loss(self, hyperparameters): for i in range(len(hyperparameters)): log_lr, nl, hs = hyperparameters[i] - lr = 10**float(log_lr) + lr = 10 ** float(log_lr) nl = int(nl) hs = int(hs) @@ -143,7 +162,10 @@ def compute_cv_loss(self, hyperparameters): avg = np.average(cv_outputs[i]) if self.silent is False: - print(' %.6f | %2d | %2d | %.3f' % (lr, nl, hs, avg)) + print( + " %.6f | %2d | %2d | %.3f" + % (lr, nl, hs, avg) + ) outputs = np.average(cv_outputs, axis=1) return outputs @@ -166,23 +188,36 @@ def eval_cv_brnns(self, lr, nl, hs): the best validation loss from each fold of cross validation """ - cv_losses = np.zeros(self.n_folds) - 1 # -1 so that it's obvious if something goes wrong + cv_losses = ( + np.zeros(self.n_folds) - 1 + ) # -1 so that it's obvious if something goes wrong for k in range(self.n_folds): - if self.dtype == 'sequence': + if self.dtype == "sequence": # Use a many-to-one architecture - brnn_network = 
brnn_architecture.BRNN_MtO(self.input_size, hs, nl, - self.n_classes, self.device).to(self.device) + brnn_network = brnn_architecture.BRNN_MtO( + self.input_size, hs, nl, self.n_classes, self.device + ).to(self.device) else: # Use a many-to-many architecture - brnn_network = brnn_architecture.BRNN_MtM(self.input_size, hs, nl, - self.n_classes, self.device).to(self.device) + brnn_network = brnn_architecture.BRNN_MtM( + self.input_size, hs, nl, self.n_classes, self.device + ).to(self.device) # Train network with this set of hyperparameters - train_losses, val_losses = train_network.train(brnn_network, self.cv_loaders[k][0], - self.cv_loaders[k][1], self.dtype, self.problem_type, - self.weights_file, stop_condition='iter', device=self.device, - learn_rate=lr, n_epochs=self.n_epochs, silent=True) + train_losses, val_losses = train_network.train( + brnn_network, + self.cv_loaders[k][0], + self.cv_loaders[k][1], + self.dtype, + self.problem_type, + self.weights_file, + stop_condition="iter", + device=self.device, + learn_rate=lr, + n_epochs=self.n_epochs, + silent=True, + ) # Take best val loss best_val_loss = np.min(val_losses) cv_losses[k] = best_val_loss @@ -211,7 +246,7 @@ def initial_search(self, x): for i in range(len(x)): log_lr, nl, hs = x[i] - lr = 10**float(log_lr) + lr = 10 ** float(log_lr) nl = int(nl) hs = int(hs) @@ -237,32 +272,55 @@ def optimize(self): """ # Initial hyperparameter search -- used to get noise estimate - x_init = np.array([[-3.0, 1, 20], [-3.0, 2, 20], [-3.0, 3, 20], [-3.0, 4, 20], [-3.0, 5, 20], - [-2.0, 2, 20], [-3.3, 2, 20], [-4.0, 2, 20], [-5.0, 2, 20], - [-3.0, 2, 5], [-3.0, 2, 15], [-3.0, 2, 35], [-3.0, 2, 50]]) + x_init = np.array( + [ + [-3.0, 1, 20], + [-3.0, 2, 20], + [-3.0, 3, 20], + [-3.0, 4, 20], + [-3.0, 5, 20], + [-2.0, 2, 20], + [-3.3, 2, 20], + [-4.0, 2, 20], + [-5.0, 2, 20], + [-3.0, 2, 5], + [-3.0, 2, 15], + [-3.0, 2, 35], + [-3.0, 2, 50], + ] + ) y_init, noise = self.initial_search(x_init) if self.silent is False: print("\nInitial search results:") print("lr\tnl\ths\toutput") for i in range(len(x_init)): - print("%.5f\t%2d\t%2d\t%.4f" % (10**x_init[i][0], x_init[i][1], x_init[i][2], y_init[i][0])) + print( + "%.5f\t%2d\t%2d\t%.4f" + % (10 ** x_init[i][0], x_init[i][1], x_init[i][2], y_init[i][0]) + ) print("Noise estimate:", noise) - print('\n') - print('Primary optimization:') - print('--------------------\n') - print('Learning rate | n_layers | hidden vector size | avg CV loss ') - print('======================================================================') - - optimizer = BayesianOptimization(f=self.compute_cv_loss, - domain=self.bds, - model_type='GP', - acquisition_type='EI', - acquisition_jitter=0.05, - X=x_init, - Y=y_init, - noise_var=noise, - maximize=False) + print("\n") + print("Primary optimization:") + print("--------------------\n") + print( + "Learning rate | n_layers | hidden vector size | avg CV loss " + ) + print( + "======================================================================" + ) + + optimizer = BayesianOptimization( + f=self.compute_cv_loss, + domain=self.bds, + model_type="GP", + acquisition_type="EI", + acquisition_jitter=0.05, + X=x_init, + Y=y_init, + noise_var=noise, + maximize=False, + ) optimizer.run_optimization(max_iter=self.max_iterations) @@ -270,8 +328,10 @@ def optimize(self): outs = optimizer.get_evaluations()[1].flatten() if self.silent is False: - print("\nThe optimal hyperparameters are:\nlr = %.5f\nnl = %d\nhs = %d" % - (10**optimizer.x_opt[0], optimizer.x_opt[1], optimizer.x_opt[2])) + print( 
+ "\nThe optimal hyperparameters are:\nlr = %.5f\nnl = %d\nhs = %d" + % (10 ** optimizer.x_opt[0], optimizer.x_opt[1], optimizer.x_opt[2]) + ) print() return optimizer.x_opt diff --git a/parrot/brnn_architecture.py b/parrot/brnn_architecture.py index efe09c9..7e52527 100644 --- a/parrot/brnn_architecture.py +++ b/parrot/brnn_architecture.py @@ -8,7 +8,7 @@ Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ import torch @@ -24,15 +24,15 @@ class BRNN_MtM(nn.Module): aggregates the deepest hidden layers of both directions and produces the outputs. - "Many-to-many" refers to the fact that the network will produce outputs - corresponding to every item of the input sequence. For example, an input + "Many-to-many" refers to the fact that the network will produce outputs + corresponding to every item of the input sequence. For example, an input sequence of length 10 will produce 10 sequential outputs. Attributes ---------- device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). hidden_size : int Size of hidden vectors in the network num_layers : int @@ -43,8 +43,8 @@ class BRNN_MtM(nn.Module): it should be the number of classes. lstm : PyTorch LSTM object The bidirectional LSTM layer(s) of the recurrent neural network. - fc : PyTorch Linear object - The fully connected linear layer of the recurrent neural network. Across + fc : PyTorch Linear object + The fully connected linear layer of the recurrent neural network. Across the length of the input sequence, this layer aggregates the output of the LSTM nodes from the deepest forward layer and deepest reverse layer and returns the output for that residue in the sequence. @@ -66,7 +66,7 @@ def __init__(self, input_size, hidden_size, num_layers, num_classes, device): it should be the number of classes. device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). 
""" super(BRNN_MtM, self).__init__() @@ -74,10 +74,12 @@ def __init__(self, input_size, hidden_size, num_layers, num_classes, device): self.hidden_size = hidden_size self.num_layers = num_layers self.num_classes = num_classes - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first=True, bidirectional=True) - self.fc = nn.Linear(in_features=hidden_size*2, # *2 for bidirection - out_features=num_classes) + self.lstm = nn.LSTM( + input_size, hidden_size, num_layers, batch_first=True, bidirectional=True + ) + self.fc = nn.Linear( + in_features=hidden_size * 2, out_features=num_classes # *2 for bidirection + ) def forward(self, x): """Propogate input sequences through the network to produce outputs @@ -98,10 +100,12 @@ def forward(self, x): # Set initial states # h0 and c0 dimensions: [num_layers*2 X batch_size X hidden_size] - h0 = torch.zeros(self.num_layers*2, # *2 for bidirection - x.size(0), self.hidden_size).to(self.device) - c0 = torch.zeros(self.num_layers*2, - x.size(0), self.hidden_size).to(self.device) + h0 = torch.zeros( + self.num_layers * 2, x.size(0), self.hidden_size # *2 for bidirection + ).to(self.device) + c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to( + self.device + ) # Forward propagate LSTM # out: tensor of shape: [batch_size, seq_length, hidden_size*2] @@ -121,7 +125,7 @@ class BRNN_MtO(nn.Module): aggregates the deepest hidden layers of both directions and produces the output. - "Many-to-one" refers to the fact that the network will produce a single output + "Many-to-one" refers to the fact that the network will produce a single output for an entire input sequence. For example, an input sequence of length 10 will produce only one output. @@ -129,7 +133,7 @@ class BRNN_MtO(nn.Module): ---------- device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). hidden_size : int Size of hidden vectors in the network num_layers : int @@ -140,8 +144,8 @@ class BRNN_MtO(nn.Module): it should be the number of classes. lstm : PyTorch LSTM object The bidirectional LSTM layer(s) of the recurrent neural network. - fc : PyTorch Linear object - The fully connected linear layer of the recurrent neural network. Across + fc : PyTorch Linear object + The fully connected linear layer of the recurrent neural network. Across the length of the input sequence, this layer aggregates the output of the LSTM nodes from the deepest forward layer and deepest reverse layer and returns the output for that residue in the sequence. @@ -163,17 +167,19 @@ def __init__(self, input_size, hidden_size, num_layers, num_classes, device): it should be the number of classes. device : str String describing where the network is physically stored on the computer. - Should be either 'cpu' or 'cuda' (GPU). + Should be 'cpu', 'mps' or 'cuda' (GPU). 
""" super(BRNN_MtO, self).__init__() self.device = device self.hidden_size = hidden_size self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first=True, bidirectional=True) - self.fc = nn.Linear(in_features=hidden_size*2, # *2 for bidirection - out_features=num_classes) + self.lstm = nn.LSTM( + input_size, hidden_size, num_layers, batch_first=True, bidirectional=True + ) + self.fc = nn.Linear( + in_features=hidden_size * 2, out_features=num_classes # *2 for bidirection + ) def forward(self, x): """Propogate input sequences through the network to produce outputs @@ -194,10 +200,12 @@ def forward(self, x): # Set initial states # h0 and c0 dimensions: [num_layers*2 X batch_size X hidden_size] - h0 = torch.zeros(self.num_layers*2, # *2 for bidirection - x.size(0), self.hidden_size).to(self.device) - c0 = torch.zeros(self.num_layers*2, - x.size(0), self.hidden_size).to(self.device) + h0 = torch.zeros( + self.num_layers * 2, x.size(0), self.hidden_size # *2 for bidirection + ).to(self.device) + c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to( + self.device + ) # Forward propagate LSTM # out: tensor of shape: [batch_size, seq_length, hidden_size*2] diff --git a/parrot/train_network.py b/parrot/train_network.py index d4d939a..eb0a7fe 100644 --- a/parrot/train_network.py +++ b/parrot/train_network.py @@ -8,20 +8,30 @@ Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ +import numpy as np import torch import torch.nn as nn -from torch.utils.data import Dataset, DataLoader -import numpy as np - -from parrot import brnn_plot -from parrot import encode_sequence - -def train(network, train_loader, val_loader, datatype, problem_type, weights_file, - stop_condition, device, learn_rate, n_epochs, verbose=False, silent=False): +from parrot import brnn_plot, encode_sequence + + +def train( + network, + train_loader, + val_loader, + datatype, + problem_type, + weights_file, + stop_condition, + device, + learn_rate, + n_epochs, + verbose=False, + silent=False, +): """Train a BRNN and save the best performing network weights Train the network on a training set, and every epoch evaluate its performance on @@ -30,8 +40,8 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil User must specify the machine learning tast (`problem_type`) and the format of the data (`datatype`). Additionally, this function requires the learning rate - hyperparameter and the number of epochs of training. The other hyperparameters, - number of hidden layers and hidden vector size, are implictly included on the + hyperparameter and the number of epochs of training. The other hyperparameters, + number of hidden layers and hidden vector size, are implictly included on the the provided network. The user may specify if they want to train the network for a set number of @@ -64,8 +74,9 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil performance has sufficiently stagnated. If the performance plateaus for `n_epochs` consecutive epochs, then training will stop. device : str - Location of where training will take place--should be either 'cpu' or - 'cuda' (GPU). If available, training on GPU is typically much faster. + Location of where training will take place--should be 'cpu', 'mps' (Apple + GPU) or 'cuda' (GPU). If available, training on GPU is typically + much faster. 
learn_rate : float Initial learning rate of network training. The training process is controlled by the Adam optimization algorithm, so this learning rate @@ -91,13 +102,13 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil optimizer = torch.optim.Adam(network.parameters(), lr=learn_rate) # Set loss criteria - if problem_type == 'regression': - if datatype == 'residues': - criterion = nn.MSELoss(reduction='sum') - elif datatype == 'sequence': - criterion = nn.L1Loss(reduction='sum') - elif problem_type == 'classification': - criterion = nn.CrossEntropyLoss(reduction='sum') + if problem_type == "regression": + if datatype == "residues": + criterion = nn.MSELoss(reduction="sum") + elif datatype == "sequence": + criterion = nn.L1Loss(reduction="sum") + elif problem_type == "classification": + criterion = nn.CrossEntropyLoss(reduction="sum") network = network.float() total_step = len(train_loader) @@ -105,7 +116,7 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil avg_train_losses = [] avg_val_losses = [] - if stop_condition == 'auto': + if stop_condition == "auto": min_epochs = n_epochs # Set to some arbitrarily large number of iterations -- will stop automatically n_epochs = 20000000 @@ -127,10 +138,10 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil # Forward pass outputs = network(vectors.float()) - if problem_type == 'regression': + if problem_type == "regression": loss = criterion(outputs, targets.float()) else: - if datatype == 'residues': + if datatype == "residues": outputs = outputs.permute(0, 2, 1) loss = criterion(outputs, targets.long()) @@ -147,10 +158,10 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil # Forward pass outputs = network(vectors.float()) - if problem_type == 'regression': + if problem_type == "regression": loss = criterion(outputs, targets.float()) else: - if datatype == 'residues': + if datatype == "residues": outputs = outputs.permute(0, 2, 1) loss = criterion(outputs, targets.long()) @@ -162,12 +173,12 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil val_loss /= len(val_loader.dataset) signif_decrease = True - if stop_condition == 'auto' and epoch > min_epochs - 1: + if stop_condition == "auto" and epoch > min_epochs - 1: # Check to see if loss has stopped decreasing last_epochs_loss = avg_val_losses[-min_epochs:] for loss in last_epochs_loss: - if val_loss >= loss*0.995: + if val_loss >= loss * 0.995: signif_decrease = False # If network performance has plateaued over the last range of epochs, end training @@ -176,7 +187,7 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil # Only save updated weights to memory if they improve val set performance if val_loss < min_val_loss: - min_val_loss = val_loss # Reset min_val_loss + min_val_loss = val_loss # Reset min_val_loss last_decrease = epoch torch.save(network.state_dict(), weights_file) # Save model @@ -185,9 +196,9 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil avg_val_losses.append(val_loss) if verbose: - print('Epoch %d\tLoss %.4f' % (epoch, val_loss)) + print("Epoch %d\tLoss %.4f" % (epoch, val_loss)) elif epoch % 5 == 0 and silent is False: - print('Epoch %d\tLoss %.4f' % (epoch, val_loss)) + print("Epoch %d\tLoss %.4f" % (epoch, val_loss)) # This is placed here to ensure that the best network, even if the performance # improvement is marginal, is saved. 
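Since train() itself is device-agnostic, the MPS support added by this patch amounts to handing it a torch.device("mps"). A minimal usage sketch for reference, not part of the patch: train_loader and val_loader are placeholder DataLoaders, the hyperparameter values are arbitrary, and torch.backends.mps assumes PyTorch >= 1.12.

    import torch

    from parrot import brnn_architecture, train_network

    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

    # Many-to-one network for per-sequence regression (num_classes=1)
    network = brnn_architecture.BRNN_MtO(
        input_size=20, hidden_size=10, num_layers=1, num_classes=1, device=device
    ).to(device)

    train_losses, val_losses = train_network.train(
        network,
        train_loader,  # placeholder: DataLoader over the training split
        val_loader,    # placeholder: DataLoader over the validation split
        datatype="sequence",
        problem_type="regression",
        weights_file="network.pt",
        stop_condition="iter",
        device=device,
        learn_rate=0.001,
        n_epochs=100,
    )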
@@ -198,15 +209,23 @@ def train(network, train_loader, val_loader, datatype, problem_type, weights_fil
     return avg_train_losses, avg_val_losses
 
 
-def test_labeled_data(network, test_loader, datatype,
-                      problem_type, weights_file, num_classes,
-                      probabilistic_classification, include_figs,
-                      device, output_file_prefix=''):
+def test_labeled_data(
+    network,
+    test_loader,
+    datatype,
+    problem_type,
+    weights_file,
+    num_classes,
+    probabilistic_classification,
+    include_figs,
+    device,
+    output_file_prefix="",
+):
     """Test a trained BRNN on labeled sequences
 
     Using the saved weights of a trained network, run a set of sequences through
     the network and evaluate the performancd. Return the average loss per
-    sequence and plot the results. Testing a network on previously-unseen data 
+    sequence and plot the results. Testing a network on previously-unseen data
     provides a useful estimate of how generalizeable the network's performance is.
 
     Parameters
     ----------
@@ -232,10 +251,11 @@ def test_labeled_data(network, test_loader, datatype,
     include_figs: bool
         Whether or not matplotlib figures should be generated.
     device : str
-        Location of where testing will take place--should be either 'cpu' or
-        'cuda' (GPU). If available, training on GPU is typically much faster.
+        Location of where testing will take place--should be 'cpu', 'mps' (Apple
+        GPU) or 'cuda' (GPU). If available, testing on GPU is typically
+        much faster.
     output_file_prefix : str
-        Path and filename prefix to which the test set predictions and plots will be saved. 
+        Path and filename prefix to which the test set predictions and plots will be saved.
 
     Returns
     -------
@@ -251,20 +271,20 @@ def test_labeled_data(network, test_loader, datatype,
     network.load_state_dict(torch.load(weights_file))
 
     # Get output directory for images
-    network_filename = weights_file.split('/')[-1]
-    output_dir = weights_file[:-len(network_filename)]
+    network_filename = weights_file.split("/")[-1]
+    output_dir = weights_file[: -len(network_filename)]
 
     # Set loss criteria
-    if problem_type == 'regression':
+    if problem_type == "regression":
         criterion = nn.MSELoss()
-    elif problem_type == 'classification':
+    elif problem_type == "classification":
         criterion = nn.CrossEntropyLoss()
 
     test_loss = 0
     all_targets = []
     all_outputs = []
     predictions = []
-    for names, vectors, targets in test_loader:     # batch size of 1
+    for names, vectors, targets in test_loader:  # batch size of 1
         all_targets.append(targets)
 
         vectors = vectors.to(device)
@@ -272,10 +292,10 @@
         # Forward pass
         outputs = network(vectors.float())
 
-        if problem_type == 'regression':
+        if problem_type == "regression":
             loss = criterion(outputs, targets.float())
         else:
-            if datatype == 'residues':
+            if datatype == "residues":
                 outputs = outputs.permute(0, 2, 1)
             loss = criterion(outputs, targets.long())
 
@@ -283,37 +303,49 @@
         all_outputs.append(outputs.detach())
 
         # Add to list as: [seq_vector, true value, predicted value, name]
-        predictions.append([vectors[0].cpu().numpy(), targets.cpu().numpy()
-                            [0], outputs.cpu().detach().numpy(), names[0]])
+        predictions.append(
+            [
+                vectors[0].cpu().numpy(),
+                targets.cpu().numpy()[0],
+                outputs.cpu().detach().numpy(),
+                names[0],
+            ]
+        )
 
     # Plot 'accuracy' depending on the problem type and datatype
-    if problem_type == 'regression':
-        if datatype == 'residues':
+    if problem_type == "regression":
+        if datatype == "residues":
             if include_figs:
-                brnn_plot.residue_regression_scatterplot(all_targets,
all_outputs, - output_file_prefix=output_file_prefix) + brnn_plot.residue_regression_scatterplot( + all_targets, all_outputs, output_file_prefix=output_file_prefix + ) # Format predictions for i in range(len(predictions)): predictions[i][2] = predictions[i][2].flatten() predictions[i][1] = predictions[i][1].flatten() - elif datatype == 'sequence': + elif datatype == "sequence": if include_figs: - brnn_plot.sequence_regression_scatterplot(all_targets, all_outputs, - output_file_prefix=output_file_prefix) + brnn_plot.sequence_regression_scatterplot( + all_targets, all_outputs, output_file_prefix=output_file_prefix + ) # Format predictions for i in range(len(predictions)): predictions[i][2] = predictions[i][2][0][0] predictions[i][1] = predictions[i][1][0] - elif problem_type == 'classification': + elif problem_type == "classification": - if datatype == 'residues': + if datatype == "residues": if include_figs: - brnn_plot.res_confusion_matrix(all_targets, all_outputs, num_classes, - output_file_prefix=output_file_prefix) + brnn_plot.res_confusion_matrix( + all_targets, + all_outputs, + num_classes, + output_file_prefix=output_file_prefix, + ) # Format predictions and assign class predictions for i in range(len(predictions)): @@ -322,7 +354,7 @@ def test_labeled_data(network, test_loader, datatype, pred_values = np.argmax(predictions[i][2], axis=1)[0] predictions[i][2] = np.array(pred_values, dtype=np.int) - elif datatype == 'sequence': + elif datatype == "sequence": if probabilistic_classification: # Probabilistic assignment of class predictions # Optional implementation for classification tasks @@ -337,10 +369,18 @@ def test_labeled_data(network, test_loader, datatype, # Plot ROC and PR curves if include_figs: - brnn_plot.plot_roc_curve(all_targets, pred_probabilities, num_classes, - output_file_prefix=output_file_prefix) - brnn_plot.plot_precision_recall_curve(all_targets, pred_probabilities, - num_classes, output_file_prefix=output_file_prefix) + brnn_plot.plot_roc_curve( + all_targets, + pred_probabilities, + num_classes, + output_file_prefix=output_file_prefix, + ) + brnn_plot.plot_precision_recall_curve( + all_targets, + pred_probabilities, + num_classes, + output_file_prefix=output_file_prefix, + ) else: # Absolute assignment of class predictions @@ -351,20 +391,31 @@ def test_labeled_data(network, test_loader, datatype, # Plot confusion matrix (if not in probabilistic classification mode) if include_figs: - brnn_plot.confusion_matrix(all_targets, all_outputs, num_classes, - output_file_prefix=output_file_prefix) + brnn_plot.confusion_matrix( + all_targets, + all_outputs, + num_classes, + output_file_prefix=output_file_prefix, + ) return test_loss / len(test_loader.dataset), predictions -def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', encoder=None, print_frequency=None): +def test_unlabeled_data( + network, + sequences, + device, + encoding_scheme="onehot", + encoder=None, + print_frequency=None, +): """Test a trained BRNN on unlabeled sequences Use a trained network to make predictions on previously-unseen data. - ** + ** Note: Unlike the previous functions, `network` here must have pre-loaded - weights. + weights. ** Parameters @@ -374,8 +425,9 @@ def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', en sequences : list A list of amino acid sequences to test using the network device : str - Location of where testing will take place--should be either 'cpu' or - 'cuda' (GPU). If available, training on GPU is typically much faster. 
+        Location of where testing will take place--should be 'cpu', 'mps' (Apple
+        GPU) or 'cuda' (GPU). If available, testing on GPU is typically
+        much faster.
     encoding_scheme : str, optional
         How amino acid sequences are to be encoded as numeric vectors. Currently,
         'onehot','biophysics' and 'user' are the implemented options.
@@ -386,7 +438,7 @@ def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', en
     print_frequency : int
         If provided defines at what sequence interval an update is printed.
         Default = None.
-        
+
     Returns
     -------
     dict
@@ -403,13 +455,13 @@ def test_unlabeled_data(network, sequences, device, encoding_scheme='onehot', en
         local_count = local_count + 1
         if print_frequency is not None:
             if local_count % print_frequency == 0:
-                print(f'On {local_count} of {total_count}')
+                print(f"On {local_count} of {total_count}")
 
-        if encoding_scheme == 'onehot':
+        if encoding_scheme == "onehot":
             seq_vector = encode_sequence.one_hot(seq)
-        elif encoding_scheme == 'biophysics':
+        elif encoding_scheme == "biophysics":
             seq_vector = encode_sequence.biophysics(seq)
-        elif encoding_scheme == 'user':
+        elif encoding_scheme == "user":
             seq_vector = encoder.encode(seq)
 
         seq_vector = seq_vector.view(1, len(seq_vector), -1)
diff --git a/scripts/parrot-optimize b/scripts/parrot-optimize
index 22a8365..f091951 100755
--- a/scripts/parrot-optimize
+++ b/scripts/parrot-optimize
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
 """
 Usage: $ parrot-optimize data_file output_network
- 
-Driver script for finding optimal hyperparameters for a bidirectional recurrent 
+
+Driver script for finding optimal hyperparameters for a bidirectional recurrent
 neural network on a given dataset, then training a network with those parameters
 
 For more information on usage, use the '-h' flag.
@@ -13,7 +13,7 @@ idptools-parrot was developed by the Holehouse lab
 Question/comments/concerns? Raise an issue on github:
 https://github.com/idptools/parrot
 
-Licensed under the MIT license. 
+Licensed under the MIT license.
 """
 
 import os
@@ -34,63 +34,148 @@ from parrot.tools import validate_args
 from parrot.tools import dataset_warnings
 
 # Parse the command line arguments
-parser = argparse.ArgumentParser(description='Train and test a bi-directional RNN using entire sequence.')
-
-parser.add_argument('data_file', help='path to tsv file with format: ')
-
-parser.add_argument('output_network', help='location to save the trained network')
-
-parser.add_argument('-d', '--datatype', metavar='dtype', type=str, required=True,
-                    help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'")
-
-parser.add_argument('-c', '--classes', type=int, metavar='num_classes', required=True,
-                    help='REQUIRED. 
Number of output classes, for regression put 1') - -parser.add_argument('-b', '--batch', default=32, type=int, metavar='batch_size', - help='size of training batch (def=32)') - -parser.add_argument('-e', '--epochs', default=100, type=int, metavar='num_epochs', - help='number of training epochs (def=100)') - -parser.add_argument('--max-iter', default=50, type=int, metavar='max_iter', - help='Maximum number of iterations for the optimization procedure (def=50)') - -parser.add_argument('--split', default='', metavar='split_file', type=str, - help="file indicating how to split datafile into training, validation, and testing sets") - -parser.add_argument('--set-fractions', nargs=3, default=[0.7, 0.15, 0.15], type=float, - dest='setFractions', metavar=('train', 'val', 'test'), - help='Proportion of dataset that should be divided into training, validation, and test sets') - -parser.add_argument('--encode', default='onehot', type=str, metavar='encoding_scheme', - help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme") - -parser.add_argument('--exclude-seq-id', dest='excludeSeqID', action='store_true', - help='use if data_file lacks sequence IDs in the first column of each line') - -parser.add_argument('--probabilistic-classification', dest='probabilistic_classification', - action='store_true', help='Optional implementation for sequence classificaion') - -parser.add_argument('--include-figs', dest='include_figs', action='store_true', - help='Generate figures from training results and save to same location as network') - -parser.add_argument('--no-stats', dest='ignore_metrics', action='store_true', - help='If passed, do not output a perfomance stats file.') - -parser.add_argument('--force-cpu', dest='forceCPU', action='store_true', - help='force network to train on CPU, even if GPU is available') - -parser.add_argument('--ignore-warnings', '-w', dest='ignore_warnings', action='store_true', - help='Do not display warnings for dataset structure') - -parser.add_argument('--save-splits', dest='save_splits', action='store_true', - help='Save a split-file using the random splits from this run') - -parser.add_argument('--verbose', '-v', action='store_true', - help='Flag which, if provided, causes output to terminal to be more descriptive') - -parser.add_argument('--silent', action='store_true', - help="Flag which, if provided, ensures no output is generated to the terminal") +parser = argparse.ArgumentParser( + description="Train and test a bi-directional RNN using entire sequence." +) + +parser.add_argument( + "data_file", help="path to tsv file with format: " +) + +parser.add_argument("output_network", help="location to save the trained network") + +parser.add_argument( + "-d", + "--datatype", + metavar="dtype", + type=str, + required=True, + help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'", +) + +parser.add_argument( + "-c", + "--classes", + type=int, + metavar="num_classes", + required=True, + help="REQUIRED. 
Number of output classes, for regression put 1", +) + +parser.add_argument( + "-b", + "--batch", + default=32, + type=int, + metavar="batch_size", + help="size of training batch (def=32)", +) + +parser.add_argument( + "-e", + "--epochs", + default=100, + type=int, + metavar="num_epochs", + help="number of training epochs (def=100)", +) + +parser.add_argument( + "--max-iter", + default=50, + type=int, + metavar="max_iter", + help="Maximum number of iterations for the optimization procedure (def=50)", +) + +parser.add_argument( + "--split", + default="", + metavar="split_file", + type=str, + help="file indicating how to split datafile into training, validation, and testing sets", +) + +parser.add_argument( + "--set-fractions", + nargs=3, + default=[0.7, 0.15, 0.15], + type=float, + dest="setFractions", + metavar=("train", "val", "test"), + help="Proportion of dataset that should be divided into training, validation, and test sets", +) + +parser.add_argument( + "--encode", + default="onehot", + type=str, + metavar="encoding_scheme", + help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme", +) + +parser.add_argument( + "--exclude-seq-id", + dest="excludeSeqID", + action="store_true", + help="use if data_file lacks sequence IDs in the first column of each line", +) + +parser.add_argument( + "--probabilistic-classification", + dest="probabilistic_classification", + action="store_true", + help="Optional implementation for sequence classificaion", +) + +parser.add_argument( + "--include-figs", + dest="include_figs", + action="store_true", + help="Generate figures from training results and save to same location as network", +) + +parser.add_argument( + "--no-stats", + dest="ignore_metrics", + action="store_true", + help="If passed, do not output a perfomance stats file.", +) + +parser.add_argument( + "--force-cpu", + dest="forceCPU", + action="store_true", + help="force network to train on CPU, even if GPU is available", +) + +parser.add_argument( + "--ignore-warnings", + "-w", + dest="ignore_warnings", + action="store_true", + help="Do not display warnings for dataset structure", +) + +parser.add_argument( + "--save-splits", + dest="save_splits", + action="store_true", + help="Save a split-file using the random splits from this run", +) + +parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Flag which, if provided, causes output to terminal to be more descriptive", +) + +parser.add_argument( + "--silent", + action="store_true", + help="Flag which, if provided, ensures no output is generated to the terminal", +) args = parser.parse_args() @@ -119,29 +204,42 @@ save_splits = args.save_splits # Device configuration if forceCPU: - device = 'cpu' + device = "cpu" + device_string = "cpu" else: - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + if torch.cuda.is_available(): + device_string = "cuda" + device = torch.device(device_string) + elif torch.backends.mps.is_available() and torch.backends.mps.is_built(): + # Use MPS if available on ARM-based MacBooks + device_string = "mps" + device = torch.device(device_string) + else: + device_string = "cpu" + device = torch.device(device_string) + +if verbose: + print(f"Torch device={device_string}") ############################################################################### ################ Validate arguments and initialize: ################### # Ensure that provided data file exists -data_file = validate_args.check_file_exists(args.data_file, 'Datafile') +data_file = 
validate_args.check_file_exists(args.data_file, "Datafile") # Extract output directory and output prediction file name network_file = os.path.abspath(args.output_network) filename_prefix, output_dir = validate_args.split_file_and_directory(network_file) # If provided, check that split_file exists -if split_file != '': - split_file = validate_args.check_file_exists(split_file, 'Split-file') +if split_file != "": + split_file = validate_args.check_file_exists(split_file, "Split-file") else: split_file = None # If specified, get location where randomly generated train/val/test splits will be saved if save_splits: - save_splits_output = filename_prefix + '_split_file.txt' + save_splits_output = filename_prefix + "_split_file.txt" else: save_splits_output = None @@ -152,20 +250,22 @@ encoding_scheme, encoder, input_size = validate_args.set_encoding_scheme(encode) problem_type, collate_function = validate_args.set_ml_task(num_classes, dtype) # Ensure that network hyperparams (not being optimized) are valid -validate_args.check_positive(num_epochs, 'Number of epochs') -validate_args.check_positive(batch_size, 'Batch size') +validate_args.check_positive(num_epochs, "Number of epochs") +validate_args.check_positive(batch_size, "Batch size") # Ensure that the sum of setFractions adds up to 1 for frac in setFractions: - validate_args.check_between_zero_and_one(frac, 'Set fractions') + validate_args.check_between_zero_and_one(frac, "Set fractions") if sum(setFractions) != 1.0: - raise ValueError('Set fractions must sum to 1.') + raise ValueError("Set fractions must sum to 1.") # Ensure that task is binary sequence classification if # probabilistic_classfication is set if probabilistic_classification: - if dtype != 'sequence' or num_classes < 2: - raise ValueError('Probabilistic classification only implemented for sequence classification') + if dtype != "sequence" or num_classes < 2: + raise ValueError( + "Probabilistic classification only implemented for sequence classification" + ) # Set ignore_warnings to True if --silent is provided if silent: @@ -175,12 +275,20 @@ if silent: ################################ Main code ################################## # Split data -cvs, train, val, test = pid.split_data_cv(data_file, datatype=dtype, problem_type=problem_type, - num_classes=num_classes, excludeSeqID=excludeSeqID, - split_file=split_file, encoding_scheme=encoding_scheme, - encoder=encoder, ignoreWarnings=ignore_warnings, - percent_val=setFractions[1], percent_test=setFractions[2], - save_splits_output=save_splits_output) +cvs, train, val, test = pid.split_data_cv( + data_file, + datatype=dtype, + problem_type=problem_type, + num_classes=num_classes, + excludeSeqID=excludeSeqID, + split_file=split_file, + encoding_scheme=encoding_scheme, + encoder=encoder, + ignoreWarnings=ignore_warnings, + percent_val=setFractions[1], + percent_test=setFractions[2], + save_splits_output=save_splits_output, +) # Assess batch size compared to training set size if not ignore_warnings: @@ -189,10 +297,18 @@ if not ignore_warnings: # Convert CV datasets to dataloaders cv_loaders = [] for cv_train, cv_val in cvs: - cv_train_loader = torch.utils.data.DataLoader(dataset=cv_train, batch_size=batch_size, - collate_fn=collate_function, shuffle=True) - cv_val_loader = torch.utils.data.DataLoader(dataset=cv_val, batch_size=batch_size, - collate_fn=collate_function, shuffle=False) + cv_train_loader = torch.utils.data.DataLoader( + dataset=cv_train, + batch_size=batch_size, + collate_fn=collate_function, + shuffle=True, + ) + 
cv_val_loader = torch.utils.data.DataLoader( + dataset=cv_val, + batch_size=batch_size, + collate_fn=collate_function, + shuffle=False, + ) cv_loaders.append((cv_train_loader, cv_val_loader)) # Output to std out @@ -202,7 +318,7 @@ if silent is False: print("PARROT with hyperparameter optimization") print("---------------------------------------") if verbose: - print('Train on:\t%s' % device) + print("Train on:\t%s" % device) print("Datatype:\t%s" % dtype) print("ML Task:\t%s" % problem_type) print("Batch size:\t%d" % batch_size) @@ -210,70 +326,109 @@ if silent is False: print("Number of optimization iterations:\t%d\n" % max_iterations) # Optimization procedure -optimizer = bayesian_optimization.BayesianOptimizer(cv_loaders, input_size, num_epochs, - num_classes, dtype, network_file, - max_iterations, device, silent) +optimizer = bayesian_optimization.BayesianOptimizer( + cv_loaders, + input_size, + num_epochs, + num_classes, + dtype, + network_file, + max_iterations, + device, + silent, +) best_hyperparams = optimizer.optimize() -lr = 10**best_hyperparams[0] +lr = 10 ** best_hyperparams[0] nl = int(best_hyperparams[1]) hs = int(best_hyperparams[2]) # Save these hyperparamters to a file so that the user has a record # TODO: move to helper function -params_file = filename_prefix + '_optimal_hyperparams.txt' -with open(params_file, 'w') as f: - f.write('Learning rate: %.5f\n' % lr) - f.write('Num Layers: %d\n' % nl) - f.write('Hidden vector size: %d\n' % hs) +params_file = filename_prefix + "_optimal_hyperparams.txt" +with open(params_file, "w") as f: + f.write("Learning rate: %.5f\n" % lr) + f.write("Num Layers: %d\n" % nl) + f.write("Hidden vector size: %d\n" % hs) # Use these best hyperparams to train the network from scratch using the entire train/val sets # Add data to dataloaders -train_loader = torch.utils.data.DataLoader(dataset=train, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=True) -val_loader = torch.utils.data.DataLoader(dataset=val, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=False) -test_loader = torch.utils.data.DataLoader(dataset=test, - batch_size=1, # Set test batch size to 1 - collate_fn=collate_function, - shuffle=False) +train_loader = torch.utils.data.DataLoader( + dataset=train, batch_size=batch_size, collate_fn=collate_function, shuffle=True +) +val_loader = torch.utils.data.DataLoader( + dataset=val, batch_size=batch_size, collate_fn=collate_function, shuffle=False +) +test_loader = torch.utils.data.DataLoader( + dataset=test, + batch_size=1, # Set test batch size to 1 + collate_fn=collate_function, + shuffle=False, +) # Initialize network: -if dtype == 'sequence': - brnn_network = brnn_architecture.BRNN_MtO(input_size, hs, nl, num_classes, device).to(device) +if dtype == "sequence": + brnn_network = brnn_architecture.BRNN_MtO( + input_size, hs, nl, num_classes, device + ).to(device) else: # dtype == 'residues' - brnn_network = brnn_architecture.BRNN_MtM(input_size, hs, nl, num_classes, device).to(device) + brnn_network = brnn_architecture.BRNN_MtM( + input_size, hs, nl, num_classes, device + ).to(device) # Train network if silent is False: - print('Training with optimal hyperparams:') -train_loss, val_loss = train_network.train(brnn_network, train_loader, val_loader, datatype=dtype, - problem_type=problem_type, weights_file=network_file, - stop_condition='iter', device=device, learn_rate=lr, - n_epochs=num_epochs*2, verbose=verbose, silent=silent) + print("Training with optimal hyperparams:") +train_loss, val_loss = 
train_network.train( + brnn_network, + train_loader, + val_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + stop_condition="iter", + device=device, + learn_rate=lr, + n_epochs=num_epochs * 2, + verbose=verbose, + silent=silent, +) if include_figs: # Plot training & validation loss per epoch brnn_plot.training_loss(train_loss, val_loss, output_file_prefix=filename_prefix) # Test network -test_loss, test_set_predictions = train_network.test_labeled_data(brnn_network, test_loader, - datatype=dtype, problem_type=problem_type, - weights_file=network_file, num_classes=num_classes, - probabilistic_classification=probabilistic_classification, - include_figs=include_figs, device=device, - output_file_prefix=filename_prefix) +test_loss, test_set_predictions = train_network.test_labeled_data( + brnn_network, + test_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + num_classes=num_classes, + probabilistic_classification=probabilistic_classification, + include_figs=include_figs, + device=device, + output_file_prefix=filename_prefix, +) if silent is False: - print('\nTest Loss: %.4f' % test_loss) + print("\nTest Loss: %.4f" % test_loss) # Output performance metrics if not ignore_metrics: - brnn_plot.write_performance_metrics(test_set_predictions, dtype, problem_type, - probabilistic_classification, filename_prefix) - + brnn_plot.write_performance_metrics( + test_set_predictions, + dtype, + problem_type, + probabilistic_classification, + filename_prefix, + ) + # Output the test set predictions to a text file -brnn_plot.output_predictions_to_file(test_set_predictions, excludeSeqID, encoding_scheme, - probabilistic_classification, encoder, output_file_prefix=filename_prefix) +brnn_plot.output_predictions_to_file( + test_set_predictions, + excludeSeqID, + encoding_scheme, + probabilistic_classification, + encoder, + output_file_prefix=filename_prefix, +) diff --git a/scripts/parrot-train b/scripts/parrot-train index 39ae2b9..39dce11 100755 --- a/scripts/parrot-train +++ b/scripts/parrot-train @@ -1,7 +1,7 @@ #!/usr/bin/env python """ Usage: $ parrot-train data_file output_network - + Driver script for training a bidirectional recurrent neural network with user specified parameters. For more information on usage, use the '-h' flag. @@ -12,7 +12,7 @@ idptools-parrot was developed by the Holehouse lab Question/comments/concerns? Raise an issue on github: https://github.com/idptools/parrot -Licensed under the MIT license. +Licensed under the MIT license. """ import os @@ -32,75 +32,182 @@ from parrot.tools import validate_args from parrot.tools import dataset_warnings # Parse the command line arguments -parser = argparse.ArgumentParser(description='Train and test a bi-directional RNN using entire sequence.') - -parser.add_argument('data_file', help='path to tsv file with format: ') - -parser.add_argument('output_network', help='location to save the trained network') - -parser.add_argument('-d', '--datatype', metavar='dtype', type=str, required=True, - help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'") - -parser.add_argument('-c', '--classes', type=int, metavar='num_classes', required=True, - help='REQUIRED. 
Number of output classes, for regression put 1') - -parser.add_argument('-hs', '--hidden-size', default=10, type=int, metavar='hidden_size', - help='hidden vector size (def=10)') - -parser.add_argument('-nl', '--num-layers', default=1, type=int, metavar='num_layers', - help='number of layers per direction (def=1)') - -parser.add_argument('-lr', '--learning-rate', default=0.001, type=float, - metavar='learning_rate', help='(def=0.001)') - -parser.add_argument('-b', '--batch', default=32, type=int, metavar='batch_size', - help='size of training batch (def=32)') - -parser.add_argument('-e', '--epochs', default=100, type=int, metavar='num_epochs', - help='number of training epochs (def=100)') - -parser.add_argument('--split', default='', metavar='split_file', type=str, - help="file indicating how to split datafile into training, validation, and test sets") - -parser.add_argument('--stop', default='iter', metavar='stop_condition', - type=str, help="training stop condition: either 'auto' or 'iter' (default 'iter')") - -parser.add_argument('--set-fractions', nargs=3, default=[0.7, 0.15, 0.15], type=float, - dest='setFractions', metavar=('train', 'val', 'test'), - help='proportion of dataset that should be divided into training, validation, and test sets') - -parser.add_argument('--encode', default='onehot', type=str, metavar='encoding_scheme', - help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme") - -parser.add_argument('--exclude-seq-id', dest='excludeSeqID', action='store_true', - help='use if data_file lacks sequence IDs in the first column of each line') - -parser.add_argument('--probabilistic-classification', dest='probabilistic_classification', - action='store_true', help='Optional implementation for sequence classificaion') - -parser.add_argument('--include-figs', dest='include_figs', action='store_true', - help='Generate figures from training results and save to same location as network') - -parser.add_argument('--no-stats', dest='ignore_metrics', action='store_true', - help='If passed, do not output a performance stats file.') - -parser.add_argument('--force-cpu', dest='forceCPU', action='store_true', - help='force network to train on CPU, even if GPU is available') - -parser.add_argument('--gpu-id', dest='gpu_id', type=int, - help='User defined control over which CUDA device will be used by parrot') - -parser.add_argument('--ignore-warnings', '-w', dest='ignore_warnings', action='store_true', - help='Do not display warnings for dataset structure') - -parser.add_argument('--save-splits', dest='save_splits', action='store_true', - help='Save a split-file using the random splits from this run') - -parser.add_argument('--verbose', '-v', action='store_true', - help='Flag which, if provided, causes output to terminal to be more descriptive') - -parser.add_argument('--silent', action='store_true', - help="Flag which, if provided, ensures no output is generated to the terminal") +parser = argparse.ArgumentParser( + description="Train and test a bi-directional RNN using entire sequence." +) + +parser.add_argument( + "data_file", help="path to tsv file with format: " +) + +parser.add_argument("output_network", help="location to save the trained network") + +parser.add_argument( + "-d", + "--datatype", + metavar="dtype", + type=str, + required=True, + help="REQUIRED. Format of the input data file, must be 'sequence' or 'residues'", +) + +parser.add_argument( + "-c", + "--classes", + type=int, + metavar="num_classes", + required=True, + help="REQUIRED. 
Number of output classes, for regression put 1", +) + +parser.add_argument( + "-hs", + "--hidden-size", + default=10, + type=int, + metavar="hidden_size", + help="hidden vector size (def=10)", +) + +parser.add_argument( + "-nl", + "--num-layers", + default=1, + type=int, + metavar="num_layers", + help="number of layers per direction (def=1)", +) + +parser.add_argument( + "-lr", + "--learning-rate", + default=0.001, + type=float, + metavar="learning_rate", + help="(def=0.001)", +) + +parser.add_argument( + "-b", + "--batch", + default=32, + type=int, + metavar="batch_size", + help="size of training batch (def=32)", +) + +parser.add_argument( + "-e", + "--epochs", + default=100, + type=int, + metavar="num_epochs", + help="number of training epochs (def=100)", +) + +parser.add_argument( + "--split", + default="", + metavar="split_file", + type=str, + help="file indicating how to split datafile into training, validation, and test sets", +) + +parser.add_argument( + "--stop", + default="iter", + metavar="stop_condition", + type=str, + help="training stop condition: either 'auto' or 'iter' (default 'iter')", +) + +parser.add_argument( + "--set-fractions", + nargs=3, + default=[0.7, 0.15, 0.15], + type=float, + dest="setFractions", + metavar=("train", "val", "test"), + help="proportion of dataset that should be divided into training, validation, and test sets", +) + +parser.add_argument( + "--encode", + default="onehot", + type=str, + metavar="encoding_scheme", + help="'onehot' (default), 'biophysics', or specify a path to a user-created scheme", +) + +parser.add_argument( + "--exclude-seq-id", + dest="excludeSeqID", + action="store_true", + help="use if data_file lacks sequence IDs in the first column of each line", +) + +parser.add_argument( + "--probabilistic-classification", + dest="probabilistic_classification", + action="store_true", + help="Optional implementation for sequence classificaion", +) + +parser.add_argument( + "--include-figs", + dest="include_figs", + action="store_true", + help="Generate figures from training results and save to same location as network", +) + +parser.add_argument( + "--no-stats", + dest="ignore_metrics", + action="store_true", + help="If passed, do not output a performance stats file.", +) + +parser.add_argument( + "--force-cpu", + dest="forceCPU", + action="store_true", + help="force network to train on CPU, even if GPU is available", +) + +parser.add_argument( + "--gpu-id", + dest="gpu_id", + type=int, + help="User defined control over which CUDA device will be used by parrot", +) + +parser.add_argument( + "--ignore-warnings", + "-w", + dest="ignore_warnings", + action="store_true", + help="Do not display warnings for dataset structure", +) + +parser.add_argument( + "--save-splits", + dest="save_splits", + action="store_true", + help="Save a split-file using the random splits from this run", +) + +parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Flag which, if provided, causes output to terminal to be more descriptive", +) + +parser.add_argument( + "--silent", + action="store_true", + help="Flag which, if provided, ensures no output is generated to the terminal", +) args = parser.parse_args() print(args) @@ -134,32 +241,48 @@ save_splits = args.save_splits # Device configuration if forceCPU: - device = 'cpu' + device = "cpu" + device_string = "cpu" elif gpu_id: - device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else 'cpu') - print(f"You've specified to run this network on cuda:{gpu_id}. 
Running on {device=}") + device = torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu") + device_string = "cuda" + print( + f"You've specified to run this network on cuda:{gpu_id}. Running on {device=}" + ) else: - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + if torch.cuda.is_available(): + device_string = "cuda" + device = torch.device(device_string) + elif torch.backends.mps.is_available() and torch.backends.mps.is_built(): + # Use MPS if available on ARM-based MacBooks + device_string = "mps" + device = torch.device(device_string) + else: + device_string = "cpu" + device = torch.device(device_string) + +if verbose: + print(f"Torch device={device_string}") ############################################################################### ############# Validate arguments and initialize network: ############## # Ensure that provided data file exists -data_file = validate_args.check_file_exists(args.data_file, 'Datafile') +data_file = validate_args.check_file_exists(args.data_file, "Datafile") # Extract output directory and output prediction file name network_file = os.path.abspath(args.output_network) filename_prefix, output_dir = validate_args.split_file_and_directory(network_file) # If provided, check that split_file exists -if split_file != '': - split_file = validate_args.check_file_exists(split_file, 'Split-file') +if split_file != "": + split_file = validate_args.check_file_exists(split_file, "Split-file") else: split_file = None # If specified, get location where randomly generated train/val/test splits will be saved if save_splits: - save_splits_output = filename_prefix + '_split_file.txt' + save_splits_output = filename_prefix + "_split_file.txt" else: save_splits_output = None @@ -170,69 +293,81 @@ encoding_scheme, encoder, input_size = validate_args.set_encoding_scheme(encode) problem_type, collate_function = validate_args.set_ml_task(num_classes, dtype) # Ensure that network hyperparams are valid -validate_args.check_between_zero_and_one(learning_rate, 'Learning rate') -validate_args.check_positive(hidden_size, 'Hidden vector size') -validate_args.check_positive(num_layers, 'Number of layers') -validate_args.check_positive(num_epochs, 'Number of epochs') -validate_args.check_positive(batch_size, 'Batch size') +validate_args.check_between_zero_and_one(learning_rate, "Learning rate") +validate_args.check_positive(hidden_size, "Hidden vector size") +validate_args.check_positive(num_layers, "Number of layers") +validate_args.check_positive(num_epochs, "Number of epochs") +validate_args.check_positive(batch_size, "Batch size") # Ensure that stop condition is 'iter' or 'auto' validate_args.check_stop_condition(stop_cond, num_epochs) # Ensure that the sum of setFractions adds up to 1 for frac in setFractions: - validate_args.check_between_zero_and_one(frac, 'Set fractions') + validate_args.check_between_zero_and_one(frac, "Set fractions") if sum(setFractions) != 1.0: - raise ValueError('Set fractions must sum to 1.') + raise ValueError("Set fractions must sum to 1.") # Ensure that task is binary sequence classification if # probabilistic_classfication is set if probabilistic_classification: - if dtype != 'sequence' or num_classes < 2: - raise ValueError('Probabilistic classification only implemented for sequence classification') + if dtype != "sequence" or num_classes < 2: + raise ValueError( + "Probabilistic classification only implemented for sequence classification" + ) # Set ignore_warnings to True if --silent is provided if silent: 
ignore_warnings = True # Initialize network architecture depending on data format -if dtype == 'sequence': +if dtype == "sequence": # Use a many-to-one architecture - brnn_network = brnn_architecture.BRNN_MtO(input_size, hidden_size, - num_layers, num_classes, device).to(device) -elif dtype == 'residues': + brnn_network = brnn_architecture.BRNN_MtO( + input_size, hidden_size, num_layers, num_classes, device + ).to(device) +elif dtype == "residues": # Use a many-to-many architecture - brnn_network = brnn_architecture.BRNN_MtM(input_size, hidden_size, - num_layers, num_classes, device).to(device) + brnn_network = brnn_architecture.BRNN_MtM( + input_size, hidden_size, num_layers, num_classes, device + ).to(device) ############################################################################### ################################ Main code ################################## # Split data -train, val, test = pid.split_data(data_file, datatype=dtype, problem_type=problem_type, - num_classes=num_classes, excludeSeqID=excludeSeqID, - split_file=split_file, encoding_scheme=encoding_scheme, - encoder=encoder, percent_val=setFractions[1], - percent_test=setFractions[2], ignoreWarnings=ignore_warnings, - save_splits_output=save_splits_output) +train, val, test = pid.split_data( + data_file, + datatype=dtype, + problem_type=problem_type, + num_classes=num_classes, + excludeSeqID=excludeSeqID, + split_file=split_file, + encoding_scheme=encoding_scheme, + encoder=encoder, + percent_val=setFractions[1], + percent_test=setFractions[2], + ignoreWarnings=ignore_warnings, + save_splits_output=save_splits_output, +) # Assess batch size compared to training set size if not ignore_warnings: dataset_warnings.eval_batch_size(batch_size, len(train)) # Add data to dataloaders -train_loader = torch.utils.data.DataLoader(dataset=train, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=True) -val_loader = torch.utils.data.DataLoader(dataset=val, - batch_size=batch_size, - collate_fn=collate_function, - shuffle=False) -test_loader = torch.utils.data.DataLoader(dataset=test, - batch_size=1, # Set test batch size to 1 - collate_fn=collate_function, - shuffle=False) +train_loader = torch.utils.data.DataLoader( + dataset=train, batch_size=batch_size, collate_fn=collate_function, shuffle=True +) +val_loader = torch.utils.data.DataLoader( + dataset=val, batch_size=batch_size, collate_fn=collate_function, shuffle=False +) +test_loader = torch.utils.data.DataLoader( + dataset=test, + batch_size=1, # Set test batch size to 1 + collate_fn=collate_function, + shuffle=False, +) # Output to std out # TODO: move to helper function in /tools @@ -241,7 +376,7 @@ if silent is False: print("PARROT with user-specified parameters") print("-------------------------------------") if verbose > 1: - print('Train on:\t%s' % device) + print("Train on:\t%s" % device) print("Datatype:\t%s" % dtype) print("ML Task:\t%s" % problem_type) print("Learning rate:\t%f" % learning_rate) @@ -252,31 +387,57 @@ if silent is False: print("Validation set loss per epoch:") # Train network -train_loss, val_loss = train_network.train(brnn_network, train_loader, val_loader, datatype=dtype, - problem_type=problem_type, weights_file=network_file, - stop_condition=stop_cond, device=device, - learn_rate=learning_rate, n_epochs=num_epochs, - verbose=verbose, silent=silent) +train_loss, val_loss = train_network.train( + brnn_network, + train_loader, + val_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + 
stop_condition=stop_cond, + device=device, + learn_rate=learning_rate, + n_epochs=num_epochs, + verbose=verbose, + silent=silent, +) if include_figs: # Plot training & validation loss per epoch brnn_plot.training_loss(train_loss, val_loss, output_file_prefix=filename_prefix) # Test network -test_loss, test_set_predictions = train_network.test_labeled_data(brnn_network, test_loader, - datatype=dtype, problem_type=problem_type, - weights_file=network_file, num_classes=num_classes, - probabilistic_classification=probabilistic_classification, - include_figs=include_figs, device=device, - output_file_prefix=filename_prefix) +test_loss, test_set_predictions = train_network.test_labeled_data( + brnn_network, + test_loader, + datatype=dtype, + problem_type=problem_type, + weights_file=network_file, + num_classes=num_classes, + probabilistic_classification=probabilistic_classification, + include_figs=include_figs, + device=device, + output_file_prefix=filename_prefix, +) if silent is False: - print('\nTest Loss: %.4f' % test_loss) + print("\nTest Loss: %.4f" % test_loss) # Output performance metrics if not ignore_metrics: - brnn_plot.write_performance_metrics(test_set_predictions, dtype, problem_type, - probabilistic_classification, filename_prefix) + brnn_plot.write_performance_metrics( + test_set_predictions, + dtype, + problem_type, + probabilistic_classification, + filename_prefix, + ) # Output the test set predictions to a text file -brnn_plot.output_predictions_to_file(test_set_predictions, excludeSeqID, encoding_scheme, - probabilistic_classification, encoder, output_file_prefix=filename_prefix) +brnn_plot.output_predictions_to_file( + test_set_predictions, + excludeSeqID, + encoding_scheme, + probabilistic_classification, + encoder, + output_file_prefix=filename_prefix, +)
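The CUDA -> MPS -> CPU fallback introduced above is duplicated in parrot-train and parrot-optimize. For reference, a sketch of an equivalent shared helper (hypothetical name select_device, not part of the patch; assumes PyTorch >= 1.12 for the torch.backends.mps API):

    import torch

    def select_device(force_cpu=False, gpu_id=None):
        """Return a torch.device, preferring CUDA, then MPS, then CPU."""
        if force_cpu:
            return torch.device("cpu")
        if gpu_id is not None and torch.cuda.is_available():
            # Testing `gpu_id is not None` (rather than `elif gpu_id:`) keeps an
            # explicit `--gpu-id 0` from falling through to autodetection.
            return torch.device(f"cuda:{gpu_id}")
        if torch.cuda.is_available():
            return torch.device("cuda")
        if torch.backends.mps.is_available() and torch.backends.mps.is_built():
            return torch.device("mps")  # Apple-silicon GPU
        return torch.device("cpu")

Either script could then reduce its device block to device = select_device(forceCPU, getattr(args, "gpu_id", None)).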