Add generative models and datasets #34

Open: wants to merge 61 commits into base: main

Commits (61)

98a7e57
add rbm
josephbowles Jul 24, 2024
2f63bb6
add rbm
josephbowles Jul 24, 2024
056a9d4
add rbm
josephbowles Jul 24, 2024
24aa47e
add rbm
josephbowles Jul 24, 2024
134c2ad
ebm model
josephbowles Jul 25, 2024
59c4147
ebm model
josephbowles Jul 25, 2024
366a65e
ebm model
josephbowles Jul 25, 2024
717d447
update
josephbowles Jul 25, 2024
8bfbbb9
update
josephbowles Jul 25, 2024
2c1057a
add tqdm
josephbowles Jul 26, 2024
9ae22f4
convergence
josephbowles Jul 26, 2024
0c89b3d
identical batching
josephbowles Jul 30, 2024
1d8c9a3
fix key split
josephbowles Jul 31, 2024
24d3ed2
Canot cast array to int with jax >= 0.4.30.
vincentmr Jul 29, 2024
37f43c6
self.svm.random_state = self.rng.integers
vincentmr Jul 29, 2024
0073ebb
correct conv interval
josephbowles Jul 30, 2024
2ab65d1
fix height/width
josephbowles Jul 30, 2024
1a3d81d
Added spin blobs dataset
quantshah Aug 13, 2024
ba75d85
Merge pull request #25 from XanaduAI/main
josephbowles Aug 13, 2024
2c55999
Updated spin blobs dataset
quantshah Aug 13, 2024
459d777
Merge pull request #26 from quantshah/generative_models
josephbowles Aug 16, 2024
f238768
Added generative models and hyperparameter search
quantshah Aug 14, 2024
5d1cc70
Merge pull request #27 from quantshah/add_model
josephbowles Aug 20, 2024
10e1586
gen hp search
josephbowles Aug 20, 2024
63d00cd
remove typo
josephbowles Aug 20, 2024
f467478
update scores
josephbowles Aug 20, 2024
461f9ce
updates
josephbowles Aug 23, 2024
2242f9e
Merge pull request #28 from XanaduAI/dev-joseph
josephbowles Aug 23, 2024
ca9071b
fix init
josephbowles Aug 23, 2024
1661597
rename RBM
josephbowles Aug 23, 2024
b14f7da
fix init
josephbowles Aug 23, 2024
79b5cdb
fix init
josephbowles Aug 23, 2024
635f04b
update mmd score
josephbowles Aug 23, 2024
3725d3b
Merge pull request #30 from XanaduAI/dev-joseph
josephbowles Aug 25, 2024
21f866a
explicit kwargs
josephbowles Aug 27, 2024
114fb75
Merge pull request #31 from XanaduAI/dev-joseph
josephbowles Aug 27, 2024
045382f
fix int error
josephbowles Aug 28, 2024
cc88641
num_mcmc_steps -> num_steps
josephbowles Aug 28, 2024
9e074c5
fix cv step
josephbowles Sep 30, 2024
36013ee
fix cv steps
josephbowles Sep 30, 2024
c4f6d76
chunk sampling
josephbowles Oct 3, 2024
6a6ca4d
docstring update
josephbowles Oct 9, 2024
c81b0b1
add ising data
josephbowles Oct 9, 2024
a43c901
clean up
josephbowles Oct 10, 2024
fe511fb
update
josephbowles Oct 10, 2024
5ebdee5
Delete generative_models/benchmarks directory
josephbowles Oct 10, 2024
da5d681
update
josephbowles Oct 10, 2024
0b7d239
Delete src/qml_benchmarks/models/restricted_boltzmann_machine.py
josephbowles Oct 11, 2024
bd863c5
remove_joblib
josephbowles Oct 15, 2024
3b2d230
Update README.md
josephbowles Oct 15, 2024
28b70ba
Update scripts/score_with_best_hyperparameters.py
josephbowles Oct 15, 2024
d230ddd
Update src/qml_benchmarks/data/ising.py
josephbowles Oct 15, 2024
517ac98
Update src/qml_benchmarks/data/ising.py
josephbowles Oct 15, 2024
7677b41
cleanup
josephbowles Oct 15, 2024
1789506
add new datasets
josephbowles Oct 15, 2024
3528863
black formatting
josephbowles Oct 15, 2024
d7f41f6
Merge pull request #35 from XanaduAI/generative-models-black
josephbowles Oct 15, 2024
4cba0be
remove joblib args
josephbowles Oct 15, 2024
516c080
rbm batch sample
josephbowles Oct 16, 2024
7f826e0
increase chunk sample size
josephbowles Oct 17, 2024
4ac5ec5
smaller chunks
josephbowles Oct 20, 2024
111 changes: 98 additions & 13 deletions README.md
@@ -1,7 +1,7 @@
# Benchmarking for quantum machine learning models

This repository contains tools to compare the performance of near-term quantum machine learning (QML)
as well as standard classical machine learning models on supervised learning tasks.
as well as standard classical machine learning models on supervised and generative learning tasks.

It is based on pipelines using [Pennylane](https://pennylane.ai/) for the simulation of quantum circuits,
[JAX](https://jax.readthedocs.io/en/latest/index.html) for training,
@@ -39,12 +39,12 @@ Dependencies of this package can be installed in your environment by running
pip install -r requirements.txt
```

## Adding a custom model
## Adding a custom classifier

We use the [Scikit-learn API](https://scikit-learn.org/stable/developers/develop.html) to create
models and perform hyperparameter search.

A minimal template for a new quantum model is as follows, and can be stored
A minimal template for a new quantum classifier is as follows, and can be stored
in `qml_benchmarks/models/my_model.py`:

```python
@@ -61,18 +61,23 @@ class MyModel(BaseEstimator, ClassifierMixin):

        # reproducibility is ensured by creating a numpy PRNG and using it for all
        # subsequent random functions.
        self._random_state = random_state
        self._rng = np.random.default_rng(random_state)
        self.random_state = random_state
        self.rng = np.random.default_rng(random_state)

        # define data-dependent attributes
        self.params_ = None
        self.n_qubits_ = None

    def initialize(self, args):
        """
        initialize the model if necessary
        """
        # ... your code here ...

    def fit(self, X, y):
        """Fit the model to data X and labels y.

        Add your custom training loop here and store the trained model parameters in `self.params_`.
        Set the data-dependent attributes, such as `self.n_qubits_`.

        Args:
            X (array_like): Data of shape (n_samples, n_features)
@@ -146,9 +151,86 @@ model.fit(X_train, y_train)
print(model.score(X_test, y_test))
```


## Adding a custom generative model

The minimal template for a new generative model closely follows that of the classifier models.
Labels are set to `None` throughout to maintain scikit-learn compatibility.

```python
import numpy as np

from sklearn.base import BaseEstimator


class MyModel(BaseEstimator):
    def __init__(self, hyperparam1="some_value", random_state=42):

        # store hyperparameters as attributes
        self.hyperparam1 = hyperparam1

        # reproducibility is ensured by creating a numpy PRNG and using it for all
        # subsequent random functions.
        self.random_state = random_state
        self.rng = np.random.default_rng(random_state)

        # define data-dependent attributes
        self.params_ = None
        self.n_qubits_ = None

    def initialize(self, args):
        """
        initialize the model if necessary
        """
        # ... your code here ...

    def fit(self, X, y=None):
        """Fit the model to data X.

        Add your custom training loop here and store the trained model parameters in `self.params_`.

        Args:
            X (array_like): Data of shape (n_samples, n_features)
            y (array_like): not used (no labels)
        """
        # ... your code here ...

    def sample(self, num_samples):
        """Sample from the generative model.

        Args:
            num_samples (int): number of points to sample

        Returns:
            array_like: sampled points
        """
        # ... your code here ...

        return samples

    def score(self, X, y=None):
        """An optional custom score function to be used with hyperparameter optimization.

        Args:
            X (array_like): Data of shape (n_samples, n_features)
            y: unused (no labels for generative models)

        Returns:
            (float): score for the dataset X
        """
        # ... your code here ...
        return score
```
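
Once defined, such a model can be used as follows (a minimal usage sketch of the template above; `X_train` is a placeholder for your own training data):

```python
# minimal usage sketch; X_train is a hypothetical array of shape (n_samples, n_features)
model = MyModel(hyperparam1="some_value", random_state=0)
model.fit(X_train)                       # generative models are fit without labels
samples = model.sample(num_samples=100)  # draw new samples from the trained model
print(model.score(X_train))              # optional custom score used in hyperparameter search
```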

If the model samples binary data, it is recommended to construct models that sample binary strings (rather than $\pm1$-valued strings)
to align with the datasets designed for generative models.
Energy-based models can easily be constructed by replacing the multilayer perceptron network in `DeepEBM`
with any other differentiable network written in `flax`, as sketched below.
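
For example, a custom differentiable network could look like the following. This is a minimal sketch only: the keyword argument used to pass the network to `DeepEBM` is an assumption and may differ from the actual constructor.

```python
import jax.numpy as jnp
import flax.linen as nn


class MyEnergyNet(nn.Module):
    """A small differentiable network mapping a binary string to a scalar energy."""
    hidden_dim: int = 32

    @nn.compact
    def __call__(self, x):
        x = nn.Dense(self.hidden_dim)(x)  # linear layer acting on the input bits
        x = jnp.tanh(x)                   # differentiable nonlinearity
        x = nn.Dense(1)(x)                # scalar energy output
        return jnp.squeeze(x, axis=-1)


# Hypothetical usage: the keyword argument name below is an assumption, not the confirmed API.
# model = DeepEBM(neural_net=MyEnergyNet(hidden_dim=32))
```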

## Datasets

The `qml_benchmarks.data` module provides generating functions to create datasets for binary classification.
The `qml_benchmarks.data` module provides generating functions to create datasets for binary classification and
generative learning.

A generating function can be used like this:

```python
@@ -158,7 +240,7 @@ X, y = generate_two_curves(n_samples=200, n_features=4, degree=3, noise=0.1, offset=
```

Note that some datasets might have different return data structures, for example if the train/test split
is performed by the generating function.
is performed by the generating function. If the dataset does not include labels, `y = None` is returned.

The original datasets used in the paper can be generated by running the scripts in the `paper/benchmarks` folder,
such as:
@@ -172,15 +254,18 @@ This will create a new folder in `paper/benchmarks` containing the datasets.
## Running hyperparameter optimization

In the folder `scripts` we provide an example that can be used to
generate results for a hyperparameter search for any model and dataset. The script
generate results for a hyperparameter search for any model and dataset. The script works
for both classifier and generative models. The script
can be run as

```
python run_hyperparameter_search.py --classifier-name "DataReuploadingClassifier" --dataset-path "my_dataset.csv"
python run_hyperparameter_search.py --model "DataReuploadingClassifier" --dataset-path "my_dataset.csv"
```

where `my_dataset.csv` is a CSV file containing the training data such that each column is a feature
and the last column is the target.
where `my_dataset.csv` is a CSV file containing the training data. For classification problems, each column should
correspond to a feature and the last column to the target. For generative learning, each row
should correspond to a binary string that specifies a unique data sample, and the model should implement a `score`
method.
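
As an illustration, a generative-learning dataset in this format could be written like this (a minimal sketch; whether `read_data` expects a header row is an assumption, so none is written here):

```python
import numpy as np
import pandas as pd

# hypothetical example: 200 samples of 16 bits each, one binary string per row,
# no label column and no header row (assumed, not confirmed by the script)
X = np.random.default_rng(0).integers(0, 2, size=(200, 16))
pd.DataFrame(X).to_csv("my_dataset.csv", index=False, header=False)
```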

Unless otherwise specified, the hyperparameter grid is loaded from `qml_benchmarks/hyperparameter_settings.py`.
One can override the default grid of hyperparameters by specifying the hyperparameter list,
@@ -189,7 +274,7 @@ For example, for the `DataReuploadingClassifier` we can run:

```
python run_hyperparameter_search.py \
--classifier-name DataReuploadingClassifier \
--model DataReuploadingClassifier \
--dataset-path "my_dataset.csv" \
--n_layers 1 2 \
--observable_type "single" "full"\
1 change: 1 addition & 0 deletions requirements.txt
@@ -10,3 +10,4 @@ pyyaml~=6.0
pennyLane~=0.34
scipy~=1.11
pandas~=2.2
numpyro~=0.14.0
94 changes: 54 additions & 40 deletions scripts/run_hyperparameter_search.py
@@ -20,28 +20,33 @@
import time
import argparse
import logging

logging.getLogger().setLevel(logging.INFO)
from importlib import import_module
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from qml_benchmarks.models.base import BaseGenerator
from qml_benchmarks.hyperparam_search_utils import read_data, construct_hyperparameter_grid
from qml_benchmarks.hyperparameter_settings import hyper_parameter_settings

np.random.seed(42)

logging.info('cpu count:' + str(os.cpu_count()))
def custom_scorer(estimator, X, y=None):
return estimator.score(X, y)

logging.info('cpu count:' + str(os.cpu_count()))

if __name__ == "__main__":
# Create an argument parser
parser = argparse.ArgumentParser(description="Run experiments with hyperparameter search.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument(
"--classifier-name",
help="Classifier to run",
"--model",
help="Model to run",
)

parser.add_argument(
@@ -91,27 +96,28 @@
# Parse the arguments along with any extra arguments that might be model specific
args, unknown_args = parser.parse_known_args()

if any(arg is None for arg in [args.classifier_name,
if any(arg is None for arg in [args.model,
args.dataset_path]):
msg = "\n================================================================================"
msg += "\nA classifier from qml.benchmarks.model and dataset path are required. E.g., \n \n"
msg += "python run_hyperparameter_search \ \n--classifier DataReuploadingClassifier \ \n--dataset-path train.csv\n"
msg += "\nA model from qml.benchmarks.models and dataset path are required. E.g., \n \n"
msg += "python run_hyperparameter_search \ \n--model DataReuploadingClassifier \ \n--dataset-path train.csv\n"
msg += "\nCheck all arguments for the script with \n"
msg += "python run_hyperparameter_search --help\n"
msg += "================================================================================"
raise ValueError(msg)

# Add model specific arguments to override the default hyperparameter grid
hyperparam_grid = construct_hyperparameter_grid(
hyper_parameter_settings, args.classifier_name
hyper_parameter_settings, args.model
)

for hyperparam in hyperparam_grid:
hp_type = type(hyperparam_grid[hyperparam][0])
parser.add_argument(f'--{hyperparam}',
type=hp_type,
nargs="+",
default=hyperparam_grid[hyperparam],
help=f'{hyperparam} grid values for {args.classifier_name}')
help=f'{hyperparam} grid values for {args.model}')

args = parser.parse_args(unknown_args, namespace=args)

@@ -122,11 +128,12 @@
logging.info(
"Running hyperparameter search experiment with the following settings\n"
)
logging.info(args.classifier_name)
logging.info(args.model)
logging.info(args.dataset_path)
logging.info(" ".join(args.hyperparameter_scoring))
logging.info(args.hyperparameter_refit)
logging.info("Hyperparam grid:"+" ".join([(str(key)+str(":")+str(hyperparam_grid[key])) for key in hyperparam_grid.keys()]))
logging.info("Hyperparam grid:" + " ".join(
[(str(key) + str(":") + str(hyperparam_grid[key])) for key in hyperparam_grid.keys()]))

experiment_path = args.results_path
results_path = os.path.join(experiment_path, "results")
@@ -135,22 +142,25 @@
os.makedirs(results_path)

###################################################################
# Get the classifier, dataset and search methods from the arguments
# Get the model, dataset and search methods from the arguments
###################################################################
Classifier = getattr(
Model = getattr(
import_module("qml_benchmarks.models"),
args.classifier_name
args.model
)
classifier_name = Classifier.__name__
model_name = Model.__name__

is_generative = isinstance(Model(), BaseGenerator)
use_labels = False if is_generative else True

# Run the experiments save the results
train_dataset_filename = os.path.join(args.dataset_path)
X, y = read_data(train_dataset_filename)
X, y = read_data(train_dataset_filename, labels=use_labels)

dataset_path_obj = Path(args.dataset_path)
results_filename_stem = " ".join(
[Classifier.__name__ + "_" + dataset_path_obj.stem
+ "_GridSearchCV"])
[Model.__name__ + "_" + dataset_path_obj.stem
+ "_GridSearchCV"])

# If we have already run this experiment then continue
if os.path.isfile(os.path.join(results_path, results_filename_stem + ".csv")):
@@ -162,44 +172,48 @@
logging.warning(msg)
sys.exit(msg)
else:
logging.warning("Cleaning existing results for ", os.path.join(results_path, results_filename_stem + ".csv"))

logging.warning("Cleaning existing results for ",
os.path.join(results_path, results_filename_stem + ".csv"))

###########################################################################
# Single fit to check everything works
###########################################################################
classifier = Classifier()
model = Model()
a = time.time()
classifier.fit(X, y)
model.fit(X, y)
b = time.time()
acc_train = classifier.score(X, y)
default_score = model.score(X, y)
logging.info(" ".join(
[classifier_name,
"Dataset path",
args.dataset_path,
"Train acc:",
str(acc_train),
"Time single run",
str(b - a)])
[model_name,
"Dataset path",
args.dataset_path,
"Train score:",
str(default_score),
"Time single run",
str(b - a)])
)
if hasattr(classifier, "loss_history_"):
if hasattr(model, "loss_history_"):
if args.plot_loss:
plt.plot(classifier.loss_history_)
plt.plot(model.loss_history_)
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.show()

if hasattr(classifier, "n_qubits_"):
logging.info(" ".join(["Num qubits", f"{classifier.n_qubits_}"]))
if hasattr(model, "n_qubits_"):
logging.info(" ".join(["Num qubits", f"{model.n_qubits_}"]))

###########################################################################
# Hyperparameter search
###########################################################################
gs = GridSearchCV(estimator=classifier, param_grid=hyperparam_grid,
scoring=args.hyperparameter_scoring,
refit=args.hyperparameter_refit,
verbose=3,
n_jobs=-1).fit(

scorer = args.hyperparameter_scoring if not is_generative else custom_scorer
refit = args.hyperparameter_refit if not is_generative else False

gs = GridSearchCV(estimator=model, param_grid=hyperparam_grid,
scoring=scorer,
refit=refit,
verbose=3,
n_jobs=args.n_jobs).fit(
X, y
)
logging.info("Best hyperparams")
6 changes: 5 additions & 1 deletion scripts/score_with_best_hyperparameters.py
@@ -12,7 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""Score a model using the best hyperparameters, using a command-line script."""
"""
Score a model using the best hyperparameters, using a command-line script.
Note this is only compatible with supervised models for classification.
"""


import numpy as np
import sys
3 changes: 2 additions & 1 deletion src/qml_benchmarks/data/__init__.py
@@ -19,4 +19,5 @@
from qml_benchmarks.data.hyperplanes import generate_hyperplanes_parity
from qml_benchmarks.data.linearly_separable import generate_linearly_separable
from qml_benchmarks.data.two_curves import generate_two_curves

from qml_benchmarks.data.spin_blobs import generate_spin_blobs, generate_8blobs
from qml_benchmarks.data.ising import generate_ising