Merge pull request #6 from mbari-org/tsne
Tsne
danellecline authored May 30, 2024
2 parents 8dbc5ad + 9ec0ca9 commit 48ca9f8
Showing 3 changed files with 109 additions and 7 deletions.
15 changes: 9 additions & 6 deletions sdcat/cluster/cluster.py
@@ -10,7 +10,6 @@
import json
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.metrics.pairwise import cosine_similarity
@@ -75,10 +74,13 @@ def _run_hdbscan_assign(
# Get the number of samples which is the number of rows in the dataframe - this is used mostly for calculating coverage
num_samples = df.shape[0]

# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(df.values)
distance_matrix = 1 - cosine_sim_matrix
x = distance_matrix.astype(np.float64)
# from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from MulticoreTSNE import MulticoreTSNE as TSNE

tsne = TSNE(n_components=2, perplexity=40, metric="cosine", n_jobs=8, random_state=42, verbose=True)
embedding = tsne.fit_transform(df.values)
x = MinMaxScaler().fit_transform(embedding) # scale the embedding to 0-1

# Cluster the embeddings using HDBSCAN
if have_gpu:
@@ -93,7 +95,7 @@
labels = scan.fit_predict(x)
else:
scan = HDBSCAN(
metric='precomputed',
metric='l2',
allow_single_cluster=True,
min_cluster_size=min_cluster_size,
min_samples=min_samples,
@@ -206,6 +208,7 @@ def _run_hdbscan_assign(
df = pd.DataFrame({'x': xx[clustered, 0], 'y': xx[clustered, 1], 'labels': labels[clustered]})
p = sns.jointplot(data=df, x='x', y='y', hue='labels')
p.savefig(f"{out_path}/{prefix}_summary.png")
info(f"Saved {out_path}/{prefix}_summary.png")

with open(f'{out_path}/{prefix}_summary.json', 'w') as f:
json.dump(params, f)
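For context, here is a minimal, self-contained sketch of the reduction-and-clustering path this commit switches to: t-SNE on cosine distances, min-max scaling of the 2-D embedding, then HDBSCAN with an `l2` metric instead of a precomputed cosine-distance matrix. The feature matrix and the HDBSCAN hyperparameters below are placeholders, not values taken from sdcat's config.

```python
# Minimal sketch of the new t-SNE + HDBSCAN path (placeholder data and parameters).
import numpy as np
from MulticoreTSNE import MulticoreTSNE as TSNE
from sklearn.preprocessing import MinMaxScaler
from hdbscan import HDBSCAN

# Stand-in for df.values: one embedding vector per detection
feature_matrix = np.random.rand(200, 384).astype(np.float64)

# Reduce to 2-D with t-SNE on cosine distances, then scale to [0, 1]
tsne = TSNE(n_components=2, perplexity=40, metric="cosine", n_jobs=8, random_state=42)
embedding = tsne.fit_transform(feature_matrix)
x = MinMaxScaler().fit_transform(embedding)

# Cluster the scaled 2-D embedding; hyperparameters here are illustrative defaults
scan = HDBSCAN(metric='l2', allow_single_cluster=True,
               min_cluster_size=2, min_samples=1,
               alpha=0.92, cluster_selection_epsilon=0.0)
labels = scan.fit_predict(x)
print(np.unique(labels))  # -1 marks points left unclustered
```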
96 changes: 96 additions & 0 deletions sdcat/cluster/commands.py
@@ -259,3 +259,99 @@ def is_day(utc_dt):
shutil.copy(Path(config_ini), save_dir / f'{prefix}_config.ini')
else:
warn(f'No detections found to cluster')

@click.command('roi', help='Cluster roi. See cluster --config-ini to override cluster defaults.')
@common_args.config_ini
@click.option('--roi-dir', help='Input folder(s) with raw ROI images', multiple=True)
@click.option('--save-dir', help='Output directory to save clustered detection results')
@click.option('--device', help='Device to use, e.g. cpu or cuda:0', type=str)
@click.option('--alpha', help='Alpha is a parameter that controls the linkage. See https://hdbscan.readthedocs.io/en/latest/parameter_selection.html. Default is 0.92. Increase for less conservative clustering, e.g. 1.0', type=float)
@click.option('--cluster-selection-epsilon', help='Epsilon is a parameter that controls the linkage. Default is 0. Increase for less conservative clustering', type=float)
@click.option('--min-cluster-size', help='The minimum number of samples in a group for that group to be considered a cluster. Default is 2. Increase for less conservative clustering, e.g. 5, 15', type=int)
def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_selection_epsilon, min_cluster_size):
config = cfg.Config(config_ini)
min_samples = int(config('cluster', 'min_samples'))
alpha = alpha if alpha else float(config('cluster', 'alpha'))
min_cluster_size = min_cluster_size if min_cluster_size else int(config('cluster', 'min_cluster_size'))
cluster_selection_epsilon = cluster_selection_epsilon if cluster_selection_epsilon else float(config('cluster','cluster_selection_epsilon'))
min_similarity = float(config('cluster', 'min_similarity'))
model = config('cluster', 'model')

if device:
num_devices = torch.cuda.device_count()
info(f'{num_devices} cuda devices available')
info(f'Using device {device}')
if 'cuda' in device:
device_num = device.split(':')[-1]
info(f'Setting CUDA_VISIBLE_DEVICES to {device_num}')
torch.cuda.set_device(device)
os.environ['CUDA_VISIBLE_DEVICES'] = device_num

save_dir = Path(save_dir)
save_dir.mkdir(parents=True, exist_ok=True)

# Grab all images from the input directories
supported_extensions = ['.png', '.jpg', '.jpeg', '.JPG', '.JPEG', '.PNG']
images = []

for r in roi_dir:
roi_path = Path(r)
for ext in supported_extensions:
images.extend(list(roi_path.rglob(f'*{ext}')))

# Create a dataframe to store the combined data in an image_path column in sorted order
df = pd.DataFrame()
df['image_path'] = images

# Convert the image_path column to a string
df['image_path'] = df['image_path'].astype(str)

info(f'Found {len(df)} detections in {roi_dir}')

if len(df) == 0:
info(f'No detections found in {roi_dir}')
return

# Sort the dataframe by image_path to make sure the images are in order for start_image and end_image filtering
df = df.sort_values(by='image_path')

# Add the image_width and image_height columns to the dataframe
for index, row in df.iterrows():
im_size = Image.open(row['image_path']).size
df.at[index, 'image_width'] = im_size[0]
df.at[index, 'image_height'] = im_size[1]
df['image_width'] = df['image_width'].astype(int)
df['image_height'] = df['image_height'].astype(int)

# Create a unique crop name for each detection with a unique id
crop_path = save_dir / 'crops'
crop_path.mkdir(parents=True, exist_ok=True)
df['crop_path'] = df.apply(lambda row:
f'{crop_path}/{uuid.uuid5(uuid.NAMESPACE_DNS, row["image_path"])}.png',
axis=1)

# Add in a column for the unique crop name for each detection with a unique id
df['cluster_id'] = -1 # -1 is the default value and means that the image is not in a cluster

# Replace any NaNs with 0
df = df.fillna(0)

# Print the first 5 rows of the dataframe
info(df.head(5))

if len(df) > 0:
# A prefix for the output files to make sure the output is unique for each execution
prefix = f'{model}_{datetime.now().strftime("%Y%m%d_%H%M%S")}'

# Cluster the detections
df_cluster = cluster_vits(prefix, model, df, save_dir, alpha, cluster_selection_epsilon, min_similarity,
min_cluster_size, min_samples, roi=True)

# Merge the results with the original DataFrame
df.update(df_cluster)

# Save the clustered detections to a csv file and a copy of the config.ini file
df.to_csv(save_dir / f'{prefix}_cluster_detections.csv', index=False, header=True)
shutil.copy(Path(config_ini), save_dir / f'{prefix}_config.ini')
else:
warn(f'No detections found to cluster')
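A hedged usage sketch for the new `roi` command, invoked through click's test runner. The `--config-ini` flag name (supplied by `common_args.config_ini`) and all paths below are assumptions for illustration, not values taken from the repository.

```python
# Hypothetical invocation of the new `roi` command; option names mirror the
# decorators above, and the paths are placeholders.
from click.testing import CliRunner
from sdcat.cluster.commands import run_cluster_roi

runner = CliRunner()
result = runner.invoke(run_cluster_roi, [
    '--config-ini', 'sdcat/config/config.ini',  # assumed config location
    '--roi-dir', '/data/rois',                  # placeholder input folder with ROI images
    '--save-dir', '/data/cluster_out',          # placeholder output folder
    '--device', 'cuda:0',
    '--min-cluster-size', '5',
])
print(result.output)
```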
5 changes: 4 additions & 1 deletion sdcat/cluster/embedding.py
@@ -11,6 +11,7 @@
from numpy import save, load
import numpy as np
from sahi.utils.torch import torch
from torchvision import transforms as pth_transforms
import torch.nn as nn
import cv2
from sdcat.logger import info, err
@@ -119,8 +120,10 @@ def compute_embedding(images: list, model_name: str):

image = np.array(square_img)

# Convert the image to a tensor
norm_transform = pth_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
img_tensor = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
# Normalize the tensor with the mean and std of the ImageNet dataset
img_tensor = norm_transform(img_tensor)
img_tensor = img_tensor.unsqueeze(0) # Add batch dimension
if 'cuda' in device:
img_tensor = img_tensor.to(device)
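A small sketch isolating the preprocessing change above: pixels are scaled to [0, 1] and then normalized with the ImageNet channel statistics before the crop is embedded. The dummy `square_img` stands in for the padded square crop the surrounding function produces.

```python
# Sketch of the added normalization step (dummy image in place of the real crop).
import numpy as np
import torch
from PIL import Image
from torchvision import transforms as pth_transforms

square_img = Image.new('RGB', (224, 224))  # placeholder for the padded square crop

norm_transform = pth_transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                          std=[0.229, 0.224, 0.225])

image = np.array(square_img)                                             # HWC uint8
img_tensor = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0   # CHW in [0, 1]
img_tensor = norm_transform(img_tensor)                                  # ImageNet statistics
img_tensor = img_tensor.unsqueeze(0)                                     # add batch dimension
```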