Merge pull request #6 from mbari-org/tsne
Tsne
danellecline authored May 30, 2024
2 parents 8dbc5ad + 9ec0ca9 commit 48ca9f8
Showing 3 changed files with 109 additions and 7 deletions.
15 changes: 9 additions & 6 deletions sdcat/cluster/cluster.py
@@ -10,7 +10,6 @@
import json
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.metrics.pairwise import cosine_similarity
@@ -75,10 +74,13 @@ def _run_hdbscan_assign(
# Get the number of samples which is the number of rows in the dataframe - this is used mostly for calculating coverage
num_samples = df.shape[0]

# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(df.values)
distance_matrix = 1 - cosine_sim_matrix
x = distance_matrix.astype(np.float64)
# from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from MulticoreTSNE import MulticoreTSNE as TSNE

tsne = TSNE(n_components=2, perplexity=40, metric="cosine", n_jobs=8, random_state=42, verbose=True)
embedding = tsne.fit_transform(df.values)
x = MinMaxScaler().fit_transform(embedding) # scale the embedding to 0-1

# Cluster the embeddings using HDBSCAN
if have_gpu:
@@ -93,7 +95,7 @@
labels = scan.fit_predict(x)
else:
scan = HDBSCAN(
metric='precomputed',
metric='l2',
allow_single_cluster=True,
min_cluster_size=min_cluster_size,
min_samples=min_samples,
@@ -206,6 +208,7 @@ def _run_hdbscan_assign(
df = pd.DataFrame({'x': xx[clustered, 0], 'y': xx[clustered, 1], 'labels': labels[clustered]})
p = sns.jointplot(data=df, x='x', y='y', hue='labels')
p.savefig(f"{out_path}/{prefix}_summary.png")
info(f"Saved {out_path}/{prefix}_summary.png")

with open(f'{out_path}/{prefix}_summary.json', 'w') as f:
json.dump(params, f)
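For context, here is a minimal, self-contained sketch of the reduction-and-clustering path this commit switches to: t-SNE on cosine distances, min-max scaling of the 2-D embedding, then HDBSCAN with an `l2` metric instead of a precomputed cosine-distance matrix. The feature matrix and the HDBSCAN hyperparameters below are placeholders, not values taken from sdcat's config.

```python
# Minimal sketch of the new t-SNE + HDBSCAN path (placeholder data and parameters).
import numpy as np
from MulticoreTSNE import MulticoreTSNE as TSNE
from sklearn.preprocessing import MinMaxScaler
from hdbscan import HDBSCAN

# Stand-in for df.values: one embedding vector per detection
feature_matrix = np.random.rand(200, 384).astype(np.float64)

# Reduce to 2-D with t-SNE on cosine distances, then scale to [0, 1]
tsne = TSNE(n_components=2, perplexity=40, metric="cosine", n_jobs=8, random_state=42)
embedding = tsne.fit_transform(feature_matrix)
x = MinMaxScaler().fit_transform(embedding)

# Cluster the scaled 2-D embedding; hyperparameters here are illustrative defaults
scan = HDBSCAN(metric='l2', allow_single_cluster=True,
               min_cluster_size=2, min_samples=1,
               alpha=0.92, cluster_selection_epsilon=0.0)
labels = scan.fit_predict(x)
print(np.unique(labels))  # -1 marks points left unclustered
```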
96 changes: 96 additions & 0 deletions sdcat/cluster/commands.py
@@ -259,3 +259,99 @@ def is_day(utc_dt):
shutil.copy(Path(config_ini), save_dir / f'{prefix}_config.ini')
else:
warn(f'No detections found to cluster')

@click.command('roi', help='Cluster roi. See cluster --config-ini to override cluster defaults.')
@common_args.config_ini
@click.option('--roi-dir', help='Input folder(s) with raw ROI images', multiple=True)
@click.option('--save-dir', help='Output directory to save clustered detection results')
@click.option('--device', help='Device to use, e.g. cpu or cuda:0', type=str)
@click.option('--alpha', help='Alpha is a parameter that controls the linkage. See https://hdbscan.readthedocs.io/en/latest/parameter_selection.html. Default is 0.92. Increase for less conservative clustering, e.g. 1.0', type=float)
@click.option('--cluster-selection-epsilon', help='Epsilon is a parameter that controls the linkage. Default is 0. Increase for less conservative clustering', type=float)
@click.option('--min-cluster-size', help='The minimum number of samples in a group for that group to be considered a cluster. Default is 2. Increase for less conservative clustering, e.g. 5, 15', type=int)
def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_selection_epsilon, min_cluster_size):
config = cfg.Config(config_ini)
min_samples = int(config('cluster', 'min_samples'))
alpha = alpha if alpha else float(config('cluster', 'alpha'))
min_cluster_size = min_cluster_size if min_cluster_size else int(config('cluster', 'min_cluster_size'))
cluster_selection_epsilon = cluster_selection_epsilon if cluster_selection_epsilon else float(config('cluster','cluster_selection_epsilon'))
min_similarity = float(config('cluster', 'min_similarity'))
model = config('cluster', 'model')

if device:
num_devices = torch.cuda.device_count()
info(f'{num_devices} cuda devices available')
info(f'Using device {device}')
if 'cuda' in device:
device_num = device.split(':')[-1]
info(f'Setting CUDA_VISIBLE_DEVICES to {device_num}')
torch.cuda.set_device(device)
os.environ['CUDA_VISIBLE_DEVICES'] = device_num

save_dir = Path(save_dir)
save_dir.mkdir(parents=True, exist_ok=True)

# Grab all images from the input directories
supported_extensions = ['.png', '.jpg', '.jpeg', '.JPG', '.JPEG', '.PNG']
images = []

for r in roi_dir:
roi_path = Path(r)
for ext in supported_extensions:
images.extend(list(roi_path.rglob(f'*{ext}')))

# Create a dataframe to store the combined data in an image_path column in sorted order
df = pd.DataFrame()
df['image_path'] = images

# Convert the image_path column to a string
df['image_path'] = df['image_path'].astype(str)

info(f'Found {len(df)} detections in {roi_dir}')

if len(df) == 0:
info(f'No detections found in {roi_dir}')
return

# Sort the dataframe by image_path to make sure the images are in order for start_image and end_image filtering
df = df.sort_values(by='image_path')

# Add the image_width and image_height columns to the dataframe
for index, row in df.iterrows():
im_size = Image.open(row['image_path']).size
df.at[index, 'image_width'] = im_size[0]
df.at[index, 'image_height'] = im_size[1]
df['image_width'] = df['image_width'].astype(int)
df['image_height'] = df['image_height'].astype(int)

# Create a unique crop name for each detection with a unique id
crop_path = save_dir / 'crops'
crop_path.mkdir(parents=True, exist_ok=True)
df['crop_path'] = df.apply(lambda row:
f'{crop_path}/{uuid.uuid5(uuid.NAMESPACE_DNS, row["image_path"])}.png',
axis=1)

# Add in a column for the unique crop name for each detection with a unique id
df['cluster_id'] = -1 # -1 is the default value and means that the image is not in a cluster

# Replace any NaNs with 0
df = df.fillna(0)

# Print the first 5 rows of the dataframe
info(df.head(5))

if len(df) > 0:
# A prefix for the output files to make sure the output is unique for each execution
prefix = f'{model}_{datetime.now().strftime("%Y%m%d_%H%M%S")}'

# Cluster the detections
df_cluster = cluster_vits(prefix, model, df, save_dir, alpha, cluster_selection_epsilon, min_similarity,
min_cluster_size, min_samples, roi=True)

# Merge the results with the original DataFrame
df.update(df_cluster)

# Save the clustered detections to a csv file and a copy of the config.ini file
df.to_csv(save_dir / f'{prefix}_cluster_detections.csv', index=False, header=True)
shutil.copy(Path(config_ini), save_dir / f'{prefix}_config.ini')
else:
warn(f'No detections found to cluster')
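A hedged usage sketch for the new `roi` command, invoked through click's test runner. The `--config-ini` flag name (supplied by `common_args.config_ini`) and all paths below are assumptions for illustration, not values taken from the repository.

```python
# Hypothetical invocation of the new `roi` command; option names mirror the
# decorators above, and the paths are placeholders.
from click.testing import CliRunner
from sdcat.cluster.commands import run_cluster_roi

runner = CliRunner()
result = runner.invoke(run_cluster_roi, [
    '--config-ini', 'sdcat/config/config.ini',  # assumed config location
    '--roi-dir', '/data/rois',                  # placeholder input folder with ROI images
    '--save-dir', '/data/cluster_out',          # placeholder output folder
    '--device', 'cuda:0',
    '--min-cluster-size', '5',
])
print(result.output)
```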
5 changes: 4 additions & 1 deletion sdcat/cluster/embedding.py
@@ -11,6 +11,7 @@
from numpy import save, load
import numpy as np
from sahi.utils.torch import torch
from torchvision import transforms as pth_transforms
import torch.nn as nn
import cv2
from sdcat.logger import info, err
@@ -119,8 +120,10 @@ def compute_embedding(images: list, model_name: str):

image = np.array(square_img)

# Convert the image to a tensor
norm_transform = pth_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
img_tensor = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
# Normalize the tensor with the mean and std of the ImageNet dataset
img_tensor = norm_transform(img_tensor)
img_tensor = img_tensor.unsqueeze(0) # Add batch dimension
if 'cuda' in device:
img_tensor = img_tensor.to(device)
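A small sketch isolating the preprocessing change above: pixels are scaled to [0, 1] and then normalized with the ImageNet channel statistics before the crop is embedded. The dummy `square_img` stands in for the padded square crop the surrounding function produces.

```python
# Sketch of the added normalization step (dummy image in place of the real crop).
import numpy as np
import torch
from PIL import Image
from torchvision import transforms as pth_transforms

square_img = Image.new('RGB', (224, 224))  # placeholder for the padded square crop

norm_transform = pth_transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                          std=[0.229, 0.224, 0.225])

image = np.array(square_img)                                             # HWC uint8
img_tensor = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0   # CHW in [0, 1]
img_tensor = norm_transform(img_tensor)                                  # ImageNet statistics
img_tensor = img_tensor.unsqueeze(0)                                     # add batch dimension
```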