From f08053c27d6c7c5750416165ed20a40c542f23cd Mon Sep 17 00:00:00 2001
From: danellecline <dcline@mbari.org>
Date: Wed, 15 May 2024 16:32:16 -0700
Subject: [PATCH 01/12] feat: initial commit of new option to cluster roi only

---
 sdcat/__main__.py         | 16 ++++++-
 sdcat/cluster/commands.py | 97 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 109 insertions(+), 4 deletions(-)

diff --git a/sdcat/__main__.py b/sdcat/__main__.py
index 36fda80..8cf4fc0 100644
--- a/sdcat/__main__.py
+++ b/sdcat/__main__.py
@@ -7,7 +7,7 @@
 import click
 from sdcat.logger import err, info, create_logger_file
 from sdcat import __version__
-from sdcat.cluster.commands import run_cluster
+from sdcat.cluster.commands import run_cluster_det, run_cluster_roi
 from sdcat.detect.commands import run_detect
 
 
@@ -28,7 +28,19 @@ def cli():
     pass
 
 cli.add_command(run_detect)
-cli.add_command(run_cluster)
+
+
+@cli.group(name="cluster")
+def cli_cluster():
+    """
+    Commands related to clustering data
+    """
+    pass
+
+
+cli.add_command(cli_cluster)
+cli_cluster.add_command(run_cluster_det)
+cli_cluster.add_command(run_cluster_roi)
 
 
 if __name__ == '__main__':
diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py
index 72aa06a..5f909fe 100644
--- a/sdcat/cluster/commands.py
+++ b/sdcat/cluster/commands.py
@@ -21,7 +21,7 @@
 from sdcat.cluster.cluster import cluster_vits
 
 
-@click.command('cluster', help='Cluster detections. See cluster --config-ini to override cluster defaults.')
+@click.command('cluster-det', help='Cluster detections. See cluster --config-ini to override cluster defaults.')
 @common_args.config_ini
 @common_args.start_image
 @common_args.end_image
@@ -31,7 +31,7 @@
 @click.option('--alpha', help='Alpha is a parameter that controls the linkage. See https://hdbscan.readthedocs.io/en/latest/parameter_selection.html. Default is 0.92. Increase for less conservative clustering, e.g. 1.0', type=float)
 @click.option('--cluster-selection-epsilon', help='Epsilon is a parameter that controls the linkage. Default is 0. Increase for less conservative clustering', type=float)
 @click.option('--min-cluster-size', help='The minimum number of samples in a group for that group to be considered a cluster. Default is 2. Increase for less conservative clustering, e.g. 5, 15', type=int)
-def run_cluster(det_dir, save_dir, device, config_ini, alpha, cluster_selection_epsilon, min_cluster_size, start_image, end_image):
+def run_cluster_det(det_dir, save_dir, device, config_ini, alpha, cluster_selection_epsilon, min_cluster_size, start_image, end_image):
     config = cfg.Config(config_ini)
     max_area = int(config('cluster', 'max_area'))
     min_area = int(config('cluster', 'min_area'))
@@ -259,3 +259,96 @@ def is_day(utc_dt):
         shutil.copy(Path(config_ini), save_dir / f'{prefix}_config.ini')
     else:
         warn(f'No detections found to cluster')
+
+
+@click.command('cluster-roi', help='Cluster roi. See cluster --config-ini to override cluster defaults.')
+@common_args.config_ini
+@click.option('--roi-dir', help='Input folder(s) with raw ROI images', multiple=True)
+@click.option('--save-dir', help='Output directory to save clustered detection results')
+@click.option('--device', help='Device to use, e.g. cpu or cuda:0', type=str)
+@click.option('--alpha', help='Alpha is a parameter that controls the linkage. See https://hdbscan.readthedocs.io/en/latest/parameter_selection.html. Default is 0.92. Increase for less conservative clustering, e.g. 1.0', type=float)
+@click.option('--cluster-selection-epsilon', help='Epsilon is a parameter that controls the linkage. Default is 0. Increase for less conservative clustering', type=float)
+@click.option('--min-cluster-size', help='The minimum number of samples in a group for that group to be considered a cluster. Default is 2. Increase for less conservative clustering, e.g. 5, 15', type=int)
+def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_selection_epsilon, min_cluster_size, start_image, end_image):
+    config = cfg.Config(config_ini)
+    max_area = int(config('cluster', 'max_area'))
+    min_area = int(config('cluster', 'min_area'))
+    min_samples = int(config('cluster', 'min_samples'))
+    alpha = alpha if alpha else float(config('cluster', 'alpha'))
+    min_cluster_size = min_cluster_size if min_cluster_size else int(config('cluster', 'min_cluster_size'))
+    cluster_selection_epsilon = cluster_selection_epsilon if cluster_selection_epsilon else float(config('cluster','cluster_selection_epsilon'))
+    min_similarity = float(config('cluster', 'min_similarity'))
+    model = config('cluster', 'model')
+
+    if device:
+        num_devices = torch.cuda.device_count()
+        info(f'{num_devices} cuda devices available')
+        info(f'Using device {device}')
+        if 'cuda' in device:
+            device_num = device.split(':')[-1]
+            info(f'Setting CUDA_VISIBLE_DEVICES to {device_num}')
+            torch.cuda.set_device(device)
+            os.environ['CUDA_VISIBLE_DEVICES'] = device_num
+
+    save_dir = Path(save_dir)
+    save_dir.mkdir(parents=True, exist_ok=True)
+
+
+    # Grab all images from the input directories
+    supported_extensions = ['.png', '.jpg', '.jpeg', '.JPG', '.JPEG', '.PNG']
+    images = []
+
+    detections = []
+    roi_path = Path(roi_dir)
+    for ext in supported_extensions:
+        images.extend(list(roi_path.rglob(f'*{ext}')))
+
+    # Create a dataframe to store the combined data in an image_path column in sorted order
+    df = pd.DataFrame()
+    df['image_path'] = images
+
+    info(f'Found {len(df)} detections in {roi_path}')
+
+    if len(df) == 0:
+        info(f'No detections found in {roi_path}')
+        return
+
+    # Sort the dataframe by image_path to make sure the images are in order for start_image and end_image filtering
+    df = df.sort_values(by='image_path')
+
+    # Add in a column for the unique crop name for each detection with a unique id
+    # create a unique uuid based on the md5 hash of the box in the row
+    df['crop_path'] = df['image_path']
+
+    # Add in a column for the unique crop name for each detection with a unique id
+    df['cluster_id'] = -1  # -1 is the default value and means that the image is not in a cluster
+
+    # Remove small or large detections before clustering
+    size_before = len(df)
+    info(f'Searching through {size_before} detections')
+    df = df[(df['area'] > min_area) & (df['area'] < max_area)]
+    size_after = len(df)
+    info(f'Removed {size_before - size_after} detections that were too large or too small')
+
+    # Replace any NaNs with 0
+    df.fillna(0)
+
+    # Print the first 5 rows of the dataframe
+    info(df.head(5))
+
+    if len(df) > 0:
+        # A prefix for the output files to make sure the output is unique for each execution
+        prefix = f'{model}_{datetime.now().strftime("%Y%m%d_%H%M%S")}'
+
+        # Cluster the detections
+        df_cluster = cluster_vits(prefix, model, df, save_dir, alpha, cluster_selection_epsilon, min_similarity,
+                                  min_cluster_size, min_samples)
+
+        # Merge the results with the original DataFrame
+        df.update(df_cluster)
+
+        # Save the clustered detections to a csv file and a copy of the config.ini file
+        df.to_csv(save_dir / f'{prefix}_cluster_detections.csv', index=False, header=True)
+        shutil.copy(Path(config_ini), save_dir / f'{prefix}_config.ini')
+    else:
+        warn(f'No detections found to cluster')

From 6bb998af4245b21eb90195465e382788e612e988 Mon Sep 17 00:00:00 2001
From: danellecline <dcline@mbari.org>
Date: Wed, 15 May 2024 16:34:48 -0700
Subject: [PATCH 02/12] refactor: minor renaming for clarity

---
 sdcat/cluster/commands.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py
index 5f909fe..85b0db7 100644
--- a/sdcat/cluster/commands.py
+++ b/sdcat/cluster/commands.py
@@ -21,7 +21,7 @@
 from sdcat.cluster.cluster import cluster_vits
 
 
-@click.command('cluster-det', help='Cluster detections. See cluster --config-ini to override cluster defaults.')
+@click.command('detections', help='Cluster detections. See cluster --config-ini to override cluster defaults.')
 @common_args.config_ini
 @common_args.start_image
 @common_args.end_image
@@ -261,7 +261,7 @@ def is_day(utc_dt):
         warn(f'No detections found to cluster')
 
 
-@click.command('cluster-roi', help='Cluster roi. See cluster --config-ini to override cluster defaults.')
+@click.command('roi', help='Cluster roi. See cluster --config-ini to override cluster defaults.')
 @common_args.config_ini
 @click.option('--roi-dir', help='Input folder(s) with raw ROI images', multiple=True)
 @click.option('--save-dir', help='Output directory to save clustered detection results')

From 473aa34c6e5efd02bb9983b55f358d3afea89ed8 Mon Sep 17 00:00:00 2001
From: danellecline <dcline@mbari.org>
Date: Wed, 15 May 2024 17:23:08 -0700
Subject: [PATCH 03/12] feat: added square black pad resize for roi

---
 sdcat/cluster/cluster.py  | 22 +++++++++++++++-------
 sdcat/cluster/commands.py | 11 +++++------
 sdcat/cluster/utils.py    | 38 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/sdcat/cluster/cluster.py b/sdcat/cluster/cluster.py
index 3a77ea3..f15fc35 100644
--- a/sdcat/cluster/cluster.py
+++ b/sdcat/cluster/cluster.py
@@ -15,7 +15,7 @@
 from hdbscan import HDBSCAN
 from sklearn.metrics.pairwise import cosine_similarity
 from sdcat.logger import info, warn, debug, err
-from sdcat.cluster.utils import cluster_grid, crop_square_image
+from sdcat.cluster.utils import cluster_grid, crop_square_image, square_image
 from sdcat.cluster.embedding import fetch_embedding, has_cached_embedding, compute_norm_embedding
 
 if find_spec("cuml"):
@@ -222,12 +222,14 @@ def cluster_vits(
         cluster_selection_epsilon: float,
         min_similarity: float,
         min_cluster_size: int,
-        min_samples: int):
+        min_samples: int,
+        roi: bool = False) -> pd.DataFrame:
     """  Cluster the crops using the VITS embeddings.
     :param prefix:  A unique prefix to save artifacts from clustering
     :param model: The model to use for clustering
     :param df_dets: The dataframe with the detections
     :param output_path: The output path to save the clustering artifacts to
+    :param roi:  Whether the detections are already cropped to the ROI
     :param cluster_selection_epsilon: The epsilon parameter for HDBSCAN
     :param alpha: The alpha parameter for HDBSCAN
     :param min_similarity: The minimum similarity score to use for -1 cluster reassignment
@@ -245,12 +247,18 @@ def cluster_vits(
 
     # Skip cropping if all the crops are already done
     if num_crop != len(df_dets):
-        # Crop and squaring the images in parallel using multiprocessing to speed up the processing
-        info(f'Cropping {len(df_dets)} detections in parallel using {multiprocessing.cpu_count()} processes...')
         num_processes = min(multiprocessing.cpu_count(), len(df_dets))
-        with multiprocessing.Pool(num_processes) as pool:
-            args = [(row, 224) for index, row in df_dets.iterrows()]
-            pool.starmap(crop_square_image, args)
+        if roi == True:
+            info('ROI crops already exist. Creating square crops in parallel using {multiprocessing.cpu_count()} processes...')
+            with multiprocessing.Pool(num_processes) as pool:
+                args = [(row, 224) for index, row in df_dets.iterrows()]
+                pool.starmap(crop_square_image, args)
+        else:
+            # Crop and squaring the images in parallel using multiprocessing to speed up the processing
+            info(f'Cropping {len(df_dets)} detections in parallel using {multiprocessing.cpu_count()} processes...')
+            with multiprocessing.Pool(num_processes) as pool:
+                args = [(row, 224) for index, row in df_dets.iterrows()]
+                pool.starmap(crop_square_image, args)
 
     # Drop any rows with crop_path that have files that don't exist - sometimes the crops fail
     df_dets = df_dets[df_dets['crop_path'].apply(lambda x: os.path.exists(x))]
diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py
index 85b0db7..691a619 100644
--- a/sdcat/cluster/commands.py
+++ b/sdcat/cluster/commands.py
@@ -293,12 +293,10 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
     save_dir = Path(save_dir)
     save_dir.mkdir(parents=True, exist_ok=True)
 
-
     # Grab all images from the input directories
     supported_extensions = ['.png', '.jpg', '.jpeg', '.JPG', '.JPEG', '.PNG']
     images = []
 
-    detections = []
     roi_path = Path(roi_dir)
     for ext in supported_extensions:
         images.extend(list(roi_path.rglob(f'*{ext}')))
@@ -316,9 +314,10 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
     # Sort the dataframe by image_path to make sure the images are in order for start_image and end_image filtering
     df = df.sort_values(by='image_path')
 
-    # Add in a column for the unique crop name for each detection with a unique id
-    # create a unique uuid based on the md5 hash of the box in the row
-    df['crop_path'] = df['image_path']
+    # Create a unique crop name for each detection with a unique id
+    crop_path = save_dir / 'crops'
+    crop_path.mkdir(parents=True, exist_ok=True)
+    df['crop_path'] = df.apply(lambda row: f"{crop_path}/{uuid.uuid5(uuid.NAMESPACE_DNS, row['image_path'])}.png", axis=1)
 
     # Add in a column for the unique crop name for each detection with a unique id
     df['cluster_id'] = -1  # -1 is the default value and means that the image is not in a cluster
@@ -342,7 +341,7 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
 
         # Cluster the detections
         df_cluster = cluster_vits(prefix, model, df, save_dir, alpha, cluster_selection_epsilon, min_similarity,
-                                  min_cluster_size, min_samples)
+                                  min_cluster_size, min_samples, roi=True)
 
         # Merge the results with the original DataFrame
         df.update(df_cluster)
diff --git a/sdcat/cluster/utils.py b/sdcat/cluster/utils.py
index 1f35ee7..9b2858e 100644
--- a/sdcat/cluster/utils.py
+++ b/sdcat/cluster/utils.py
@@ -92,9 +92,45 @@ def gen_grid(with_attention: bool):
     # gen_grid(with_attention=True)
 
 
+def square_image(row, square_dim: int):
+    """
+    Pad an ROI image to a square with black bars, resize it to square_dim x square_dim and save it
+    :param row: record with image_path, crop_path, image_width and image_height attributes
+    :param square_dim: dimension of the square image
+    :return: None; the result is written to row.crop_path
+    """
+    try:
+        if not Path(row.image_path).exists():
+            warn(f'Skipping {row.crop_path} because the image {row.image_path} does not exist')
+            return
+
+        if Path(row.crop_path).exists():  # If the crop already exists, skip it
+            return
+
+        # Determine the size of the new square; cast in case the sizes arrive as floats
+        width, height = int(row.image_width), int(row.image_height)
+        max_side = max(width, height)
+
+        # Create a new square image with a black background
+        new_image = Image.new('RGB', (max_side, max_side), (0, 0, 0))
+
+        # Paste the original image onto the center of the new image; the context manager closes the file
+        with Image.open(row.image_path) as img:
+            new_image.paste(img, ((max_side - width) // 2, (max_side - height) // 2))
+
+        # Resize the padded square, not the raw ROI, so the aspect ratio is preserved
+        new_image = new_image.resize((square_dim, square_dim), Image.LANCZOS)
+
+        # Save the padded and resized image
+        new_image.save(row.crop_path)
+    except Exception as e:
+        # Report the failing image, then re-raise so the caller sees the error
+        exception(f'Error cropping {row.image_path} {e}')
+        raise e
+
 def crop_square_image(row, square_dim: int):
     """
-    Crop the image to a square padding the shorted dimension, then resize it to square_dim x square_dim
+    Crop the image to a square padding the shortest dimension, then resize it to square_dim x square_dim
     This also adjusts the crop to make sure the crop is fully in the frame, otherwise the crop that
     exceeds the frame is filled with black bars - these produce clusters of "edge" objects instead
     of the detection

From f91ac03dcd1a3ff735b982cc2b0647eb8dc32938 Mon Sep 17 00:00:00 2001
From: danellecline <dcline@mbari.org>
Date: Wed, 15 May 2024 17:26:46 -0700
Subject: [PATCH 04/12] fix: removed unused args for start/end frame

---
 sdcat/cluster/commands.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py
index 691a619..b19a40e 100644
--- a/sdcat/cluster/commands.py
+++ b/sdcat/cluster/commands.py
@@ -269,7 +269,7 @@ def is_day(utc_dt):
 @click.option('--alpha', help='Alpha is a parameter that controls the linkage. See https://hdbscan.readthedocs.io/en/latest/parameter_selection.html. Default is 0.92. Increase for less conservative clustering, e.g. 1.0', type=float)
 @click.option('--cluster-selection-epsilon', help='Epsilon is a parameter that controls the linkage. Default is 0. Increase for less conservative clustering', type=float)
 @click.option('--min-cluster-size', help='The minimum number of samples in a group for that group to be considered a cluster. Default is 2. Increase for less conservative clustering, e.g. 5, 15', type=int)
-def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_selection_epsilon, min_cluster_size, start_image, end_image):
+def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_selection_epsilon, min_cluster_size):
     config = cfg.Config(config_ini)
     max_area = int(config('cluster', 'max_area'))
     min_area = int(config('cluster', 'min_area'))

From 3808bf9531ed27d55e3dbf1e4ebd9cafc9c6454a Mon Sep 17 00:00:00 2001
From: danellecline <dcline@mbari.org>
Date: Wed, 15 May 2024 17:56:50 -0700
Subject: [PATCH 05/12] fix: roi_dir needs to support lists

---
 sdcat/cluster/commands.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py
index b19a40e..3cf8078 100644
--- a/sdcat/cluster/commands.py
+++ b/sdcat/cluster/commands.py
@@ -297,18 +297,19 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
     supported_extensions = ['.png', '.jpg', '.jpeg', '.JPG', '.JPEG', '.PNG']
     images = []
 
-    roi_path = Path(roi_dir)
-    for ext in supported_extensions:
-        images.extend(list(roi_path.rglob(f'*{ext}')))
+    for r in roi_dir:
+        roi_path = Path(r)
+        for ext in supported_extensions:
+            images.extend(list(roi_path.rglob(f'*{ext}')))
 
     # Create a dataframe to store the combined data in an image_path column in sorted order
     df = pd.DataFrame()
     df['image_path'] = images
 
-    info(f'Found {len(df)} detections in {roi_path}')
+    info(f'Found {len(df)} detections in {roi_dir}')
 
     if len(df) == 0:
-        info(f'No detections found in {roi_path}')
+        info(f'No detections found in {roi_dir}')
         return
 
     # Sort the dataframe by image_path to make sure the images are in order for start_image and end_image filtering

From 977ef32902f82d4816716118a34b36a8978210a6 Mon Sep 17 00:00:00 2001
From: danellecline <dcline@mbari.org>
Date: Wed, 15 May 2024 18:29:10 -0700
Subject: [PATCH 06/12] fix: path to string

---
 sdcat/cluster/commands.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py
index 3cf8078..cc705a4 100644
--- a/sdcat/cluster/commands.py
+++ b/sdcat/cluster/commands.py
@@ -306,6 +306,9 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
     df = pd.DataFrame()
     df['image_path'] = images
 
+    # Convert the image_path column to a string
+    df['image_path'] = df['image_path'].astype(str)
+
     info(f'Found {len(df)} detections in {roi_dir}')
 
     if len(df) == 0:
@@ -318,7 +321,9 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
     # Create a unique crop name for each detection with a unique id
     crop_path = save_dir / 'crops'
     crop_path.mkdir(parents=True, exist_ok=True)
-    df['crop_path'] = df.apply(lambda row: f"{crop_path}/{uuid.uuid5(uuid.NAMESPACE_DNS, row['image_path'])}.png", axis=1)
+    df['crop_path'] = df.apply(lambda row:
+                               f'{crop_path}/{uuid.uuid5(uuid.NAMESPACE_DNS, row["image_path"])}.png',
+                               axis=1)
 
     # Add in a column for the unique crop name for each detection with a unique id
     df['cluster_id'] = -1  # -1 is the default value and means that the image is not in a cluster

From a54d88fa4463c8937288565d06a85f274be0c324 Mon Sep 17 00:00:00 2001
From: danellecline <dcline@mbari.org>
Date: Wed, 15 May 2024 18:31:08 -0700
Subject: [PATCH 07/12] chore: remove area for ROI since approximations are
 probably a bad idea

---
 sdcat/cluster/commands.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py
index cc705a4..4b6d9c9 100644
--- a/sdcat/cluster/commands.py
+++ b/sdcat/cluster/commands.py
@@ -328,13 +328,6 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
     # Add in a column for the unique crop name for each detection with a unique id
     df['cluster_id'] = -1  # -1 is the default value and means that the image is not in a cluster
 
-    # Remove small or large detections before clustering
-    size_before = len(df)
-    info(f'Searching through {size_before} detections')
-    df = df[(df['area'] > min_area) & (df['area'] < max_area)]
-    size_after = len(df)
-    info(f'Removed {size_before - size_after} detections that were too large or too small')
-
     # Replace any NaNs with 0
     df.fillna(0)
 

From b7e2f21e136abc80d1e88133928cc68d2f30eebf Mon Sep 17 00:00:00 2001
From: danellecline <dcline@mbari.org>
Date: Wed, 15 May 2024 18:33:53 -0700
Subject: [PATCH 08/12] fix: added image width/height and fixed multiproc to
 square

---
 sdcat/cluster/cluster.py  | 2 +-
 sdcat/cluster/commands.py | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/sdcat/cluster/cluster.py b/sdcat/cluster/cluster.py
index f15fc35..3f049a0 100644
--- a/sdcat/cluster/cluster.py
+++ b/sdcat/cluster/cluster.py
@@ -252,7 +252,7 @@ def cluster_vits(
             info('ROI crops already exist. Creating square crops in parallel using {multiprocessing.cpu_count()} processes...')
             with multiprocessing.Pool(num_processes) as pool:
                 args = [(row, 224) for index, row in df_dets.iterrows()]
-                pool.starmap(crop_square_image, args)
+                pool.starmap(square_image, args)
         else:
             # Crop and squaring the images in parallel using multiprocessing to speed up the processing
             info(f'Cropping {len(df_dets)} detections in parallel using {multiprocessing.cpu_count()} processes...')
diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py
index 4b6d9c9..0a9c330 100644
--- a/sdcat/cluster/commands.py
+++ b/sdcat/cluster/commands.py
@@ -14,6 +14,7 @@
 import pandas as pd
 import pytz
 import torch
+from PIL.Image import Image
 
 from sdcat import common_args
 from sdcat.config import config as cfg
@@ -318,6 +319,12 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
     # Sort the dataframe by image_path to make sure the images are in order for start_image and end_image filtering
     df = df.sort_values(by='image_path')
 
+    # Add the image_width and image_height columns to the dataframe
+    for index, row in df.iterrows():
+        im_size = Image.open(row['image_path']).size
+        df.at[index, 'image_width'] = im_size[0]
+        df.at[index, 'image_height'] = im_size[1]
+
     # Create a unique crop name for each detection with a unique id
     crop_path = save_dir / 'crops'
     crop_path.mkdir(parents=True, exist_ok=True)

From c0da6b67a79c636606a207571873ebb6bf8ebb0b Mon Sep 17 00:00:00 2001
From: danellecline <dcline@mbari.org>
Date: Wed, 15 May 2024 18:35:04 -0700
Subject: [PATCH 09/12] fix: correct PIL image path

---
 sdcat/cluster/commands.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py
index 0a9c330..9371dff 100644
--- a/sdcat/cluster/commands.py
+++ b/sdcat/cluster/commands.py
@@ -14,7 +14,7 @@
 import pandas as pd
 import pytz
 import torch
-from PIL.Image import Image
+from PIL import Image
 
 from sdcat import common_args
 from sdcat.config import config as cfg

From 18fc005a5fc39a14327e47b6d0225ea6a79317ab Mon Sep 17 00:00:00 2001
From: danellecline <dcline@mbari.org>
Date: Wed, 15 May 2024 18:37:25 -0700
Subject: [PATCH 10/12] fix: image size in int not float needed for resize

---
 sdcat/cluster/commands.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py
index 9371dff..16ce38c 100644
--- a/sdcat/cluster/commands.py
+++ b/sdcat/cluster/commands.py
@@ -322,8 +322,8 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
     # Add the image_width and image_height columns to the dataframe
     for index, row in df.iterrows():
         im_size = Image.open(row['image_path']).size
-        df.at[index, 'image_width'] = im_size[0]
-        df.at[index, 'image_height'] = im_size[1]
+        df.at[index, 'image_width'] = int(im_size[0])
+        df.at[index, 'image_height'] = int(im_size[1])
 
     # Create a unique crop name for each detection with a unique id
     crop_path = save_dir / 'crops'

From cf3ebd37436ddd7e160a0b7b0ea14ac29ba64fe3 Mon Sep 17 00:00:00 2001
From: danellecline <dcline@mbari.org>
Date: Wed, 15 May 2024 18:41:33 -0700
Subject: [PATCH 11/12] fix: image size in int not float needed for resize

---
 sdcat/cluster/commands.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py
index 16ce38c..e263434 100644
--- a/sdcat/cluster/commands.py
+++ b/sdcat/cluster/commands.py
@@ -322,8 +322,10 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
     # Add the image_width and image_height columns to the dataframe
     for index, row in df.iterrows():
         im_size = Image.open(row['image_path']).size
-        df.at[index, 'image_width'] = int(im_size[0])
-        df.at[index, 'image_height'] = int(im_size[1])
+        df.at[index, 'image_width'] = im_size[0]
+        df.at[index, 'image_height'] = im_size[1]
+    df['image_width'] = df['image_width'].astype(int)
+    df['image_height'] = df['image_height'].astype(int)
 
     # Create a unique crop name for each detection with a unique id
     crop_path = save_dir / 'crops'

From ff9a29dbe6c81cee0e0038401b9a99309c2baecd Mon Sep 17 00:00:00 2001
From: danellecline <dcline@mbari.org>
Date: Wed, 15 May 2024 18:48:02 -0700
Subject: [PATCH 12/12] fix: check for det columns

---
 sdcat/cluster/cluster.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/sdcat/cluster/cluster.py b/sdcat/cluster/cluster.py
index 3f049a0..be5069f 100644
--- a/sdcat/cluster/cluster.py
+++ b/sdcat/cluster/cluster.py
@@ -282,9 +282,15 @@ def cluster_vits(
         (output_path / prefix).mkdir(parents=True)
 
     # Remove everything except ancillary data to include in clustering
-    ancillary_df = df_dets.drop(
-        columns=['x', 'y', 'xx', 'xy', 'w', 'h', 'image_width', 'image_height', 'cluster_id', 'cluster', 'score',
-                 'class', 'image_path', 'crop_path'])
+    columns = ['x', 'y', 'xx', 'xy', 'w', 'h', 'image_width', 'image_height', 'cluster_id', 'cluster', 'score',
+               'class', 'image_path', 'crop_path']
+    # Check if the columns exist in the dataframe
+    if all(col in df_dets.columns for col in columns):
+        ancillary_df = df_dets.drop(
+            columns=['x', 'y', 'xx', 'xy', 'w', 'h', 'image_width', 'image_height', 'cluster_id', 'cluster', 'score',
+                     'class', 'image_path', 'crop_path'])
+    else:
+        ancillary_df = df_dets
 
     # Cluster the images
     cluster_sim, unique_clusters, cluster_means, coverage = _run_hdbscan_assign(prefix,