From f08053c27d6c7c5750416165ed20a40c542f23cd Mon Sep 17 00:00:00 2001 From: danellecline Date: Wed, 15 May 2024 16:32:16 -0700 Subject: [PATCH 01/12] feat: initial commit of new option to cluster roi only --- sdcat/__main__.py | 16 ++++++- sdcat/cluster/commands.py | 97 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 109 insertions(+), 4 deletions(-) diff --git a/sdcat/__main__.py b/sdcat/__main__.py index 36fda80..8cf4fc0 100644 --- a/sdcat/__main__.py +++ b/sdcat/__main__.py @@ -7,7 +7,7 @@ import click from sdcat.logger import err, info, create_logger_file from sdcat import __version__ -from sdcat.cluster.commands import run_cluster +from sdcat.cluster.commands import run_cluster_det, run_cluster_roi from sdcat.detect.commands import run_detect @@ -28,7 +28,19 @@ def cli(): pass cli.add_command(run_detect) -cli.add_command(run_cluster) + + +@cli.group(name="cluster") +def cli_cluster(): + """ + Commands related to converting data + """ + pass + + +cli.add_command(cli_cluster) +cli_cluster.add_command(run_cluster_det) +cli_cluster.add_command(run_cluster_roi) if __name__ == '__main__': diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py index 72aa06a..5f909fe 100644 --- a/sdcat/cluster/commands.py +++ b/sdcat/cluster/commands.py @@ -21,7 +21,7 @@ from sdcat.cluster.cluster import cluster_vits -@click.command('cluster', help='Cluster detections. See cluster --config-ini to override cluster defaults.') +@click.command('cluster-det', help='Cluster detections. See cluster --config-ini to override cluster defaults.') @common_args.config_ini @common_args.start_image @common_args.end_image @@ -31,7 +31,7 @@ @click.option('--alpha', help='Alpha is a parameter that controls the linkage. See https://hdbscan.readthedocs.io/en/latest/parameter_selection.html. Default is 0.92. Increase for less conservative clustering, e.g. 1.0', type=float) @click.option('--cluster-selection-epsilon', help='Epsilon is a parameter that controls the linkage. Default is 0. Increase for less conservative clustering', type=float) @click.option('--min-cluster-size', help='The minimum number of samples in a group for that group to be considered a cluster. Default is 2. Increase for less conservative clustering, e.g. 5, 15', type=int) -def run_cluster(det_dir, save_dir, device, config_ini, alpha, cluster_selection_epsilon, min_cluster_size, start_image, end_image): +def run_cluster_det(det_dir, save_dir, device, config_ini, alpha, cluster_selection_epsilon, min_cluster_size, start_image, end_image): config = cfg.Config(config_ini) max_area = int(config('cluster', 'max_area')) min_area = int(config('cluster', 'min_area')) @@ -259,3 +259,96 @@ def is_day(utc_dt): shutil.copy(Path(config_ini), save_dir / f'{prefix}_config.ini') else: warn(f'No detections found to cluster') + + +@click.command('cluster-roi', help='Cluster roi. See cluster --config-ini to override cluster defaults.') +@common_args.config_ini +@click.option('--roi-dir', help='Input folder(s) with raw ROI images', multiple=True) +@click.option('--save-dir', help='Output directory to save clustered detection results') +@click.option('--device', help='Device to use, e.g. cpu or cuda:0', type=str) +@click.option('--alpha', help='Alpha is a parameter that controls the linkage. See https://hdbscan.readthedocs.io/en/latest/parameter_selection.html. Default is 0.92. Increase for less conservative clustering, e.g. 1.0', type=float) +@click.option('--cluster-selection-epsilon', help='Epsilon is a parameter that controls the linkage. Default is 0. Increase for less conservative clustering', type=float) +@click.option('--min-cluster-size', help='The minimum number of samples in a group for that group to be considered a cluster. Default is 2. Increase for less conservative clustering, e.g. 5, 15', type=int) +def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_selection_epsilon, min_cluster_size, start_image, end_image): + config = cfg.Config(config_ini) + max_area = int(config('cluster', 'max_area')) + min_area = int(config('cluster', 'min_area')) + min_samples = int(config('cluster', 'min_samples')) + alpha = alpha if alpha else float(config('cluster', 'alpha')) + min_cluster_size = min_cluster_size if min_cluster_size else int(config('cluster', 'min_cluster_size')) + cluster_selection_epsilon = cluster_selection_epsilon if cluster_selection_epsilon else float(config('cluster','cluster_selection_epsilon')) + min_similarity = float(config('cluster', 'min_similarity')) + model = config('cluster', 'model') + + if device: + num_devices = torch.cuda.device_count() + info(f'{num_devices} cuda devices available') + info(f'Using device {device}') + if 'cuda' in device: + device_num = device.split(':')[-1] + info(f'Setting CUDA_VISIBLE_DEVICES to {device_num}') + torch.cuda.set_device(device) + os.environ['CUDA_VISIBLE_DEVICES'] = device_num + + save_dir = Path(save_dir) + save_dir.mkdir(parents=True, exist_ok=True) + + + # Grab all images from the input directories + supported_extensions = ['.png', '.jpg', '.jpeg', '.JPG', '.JPEG', '.PNG'] + images = [] + + detections = [] + roi_path = Path(roi_dir) + for ext in supported_extensions: + images.extend(list(roi_path.rglob(f'*{ext}'))) + + # Create a dataframe to store the combined data in an image_path column in sorted order + df = pd.DataFrame() + df['image_path'] = images + + info(f'Found {len(df)} detections in {roi_path}') + + if len(df) == 0: + info(f'No detections found in {roi_path}') + return + + # Sort the dataframe by image_path to make sure the images are in order for start_image and end_image filtering + df = df.sort_values(by='image_path') + + # Add in a column for the unique crop name for each detection with a unique id + # create a unique uuid based on the md5 hash of the box in the row + df['crop_path'] = df['image_path'] + + # Add in a column for the unique crop name for each detection with a unique id + df['cluster_id'] = -1 # -1 is the default value and means that the image is not in a cluster + + # Remove small or large detections before clustering + size_before = len(df) + info(f'Searching through {size_before} detections') + df = df[(df['area'] > min_area) & (df['area'] < max_area)] + size_after = len(df) + info(f'Removed {size_before - size_after} detections that were too large or too small') + + # Replace any NaNs with 0 + df.fillna(0) + + # Print the first 5 rows of the dataframe + info(df.head(5)) + + if len(df) > 0: + # A prefix for the output files to make sure the output is unique for each execution + prefix = f'{model}_{datetime.now().strftime("%Y%m%d_%H%M%S")}' + + # Cluster the detections + df_cluster = cluster_vits(prefix, model, df, save_dir, alpha, cluster_selection_epsilon, min_similarity, + min_cluster_size, min_samples) + + # Merge the results with the original DataFrame + df.update(df_cluster) + + # Save the clustered detections to a csv file and a copy of the config.ini file + df.to_csv(save_dir / f'{prefix}_cluster_detections.csv', index=False, header=True) + shutil.copy(Path(config_ini), save_dir / f'{prefix}_config.ini') + else: + warn(f'No detections found to cluster') From 6bb998af4245b21eb90195465e382788e612e988 Mon Sep 17 00:00:00 2001 From: danellecline Date: Wed, 15 May 2024 16:34:48 -0700 Subject: [PATCH 02/12] refactor: minor renaming for clarity --- sdcat/cluster/commands.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py index 5f909fe..85b0db7 100644 --- a/sdcat/cluster/commands.py +++ b/sdcat/cluster/commands.py @@ -21,7 +21,7 @@ from sdcat.cluster.cluster import cluster_vits -@click.command('cluster-det', help='Cluster detections. See cluster --config-ini to override cluster defaults.') +@click.command('detections', help='Cluster detections. See cluster --config-ini to override cluster defaults.') @common_args.config_ini @common_args.start_image @common_args.end_image @@ -261,7 +261,7 @@ def is_day(utc_dt): warn(f'No detections found to cluster') -@click.command('cluster-roi', help='Cluster roi. See cluster --config-ini to override cluster defaults.') +@click.command('roi', help='Cluster roi. See cluster --config-ini to override cluster defaults.') @common_args.config_ini @click.option('--roi-dir', help='Input folder(s) with raw ROI images', multiple=True) @click.option('--save-dir', help='Output directory to save clustered detection results') From 473aa34c6e5efd02bb9983b55f358d3afea89ed8 Mon Sep 17 00:00:00 2001 From: danellecline Date: Wed, 15 May 2024 17:23:08 -0700 Subject: [PATCH 03/12] feat: added square black pad resize for roi --- sdcat/cluster/cluster.py | 22 +++++++++++++++------- sdcat/cluster/commands.py | 11 +++++------ sdcat/cluster/utils.py | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 14 deletions(-) diff --git a/sdcat/cluster/cluster.py b/sdcat/cluster/cluster.py index 3a77ea3..f15fc35 100644 --- a/sdcat/cluster/cluster.py +++ b/sdcat/cluster/cluster.py @@ -15,7 +15,7 @@ from hdbscan import HDBSCAN from sklearn.metrics.pairwise import cosine_similarity from sdcat.logger import info, warn, debug, err -from sdcat.cluster.utils import cluster_grid, crop_square_image +from sdcat.cluster.utils import cluster_grid, crop_square_image, square_image from sdcat.cluster.embedding import fetch_embedding, has_cached_embedding, compute_norm_embedding if find_spec("cuml"): @@ -222,12 +222,14 @@ def cluster_vits( cluster_selection_epsilon: float, min_similarity: float, min_cluster_size: int, - min_samples: int): + min_samples: int, + roi: bool = False) -> pd.DataFrame: """ Cluster the crops using the VITS embeddings. :param prefix: A unique prefix to save artifacts from clustering :param model: The model to use for clustering :param df_dets: The dataframe with the detections :param output_path: The output path to save the clustering artifacts to + :param roi: Whether the detections are already cropped to the ROI :param cluster_selection_epsilon: The epsilon parameter for HDBSCAN :param alpha: The alpha parameter for HDBSCAN :param min_similarity: The minimum similarity score to use for -1 cluster reassignment @@ -245,12 +247,18 @@ def cluster_vits( # Skip cropping if all the crops are already done if num_crop != len(df_dets): - # Crop and squaring the images in parallel using multiprocessing to speed up the processing - info(f'Cropping {len(df_dets)} detections in parallel using {multiprocessing.cpu_count()} processes...') num_processes = min(multiprocessing.cpu_count(), len(df_dets)) - with multiprocessing.Pool(num_processes) as pool: - args = [(row, 224) for index, row in df_dets.iterrows()] - pool.starmap(crop_square_image, args) + if roi == True: + info('ROI crops already exist. Creating square crops in parallel using {multiprocessing.cpu_count()} processes...') + with multiprocessing.Pool(num_processes) as pool: + args = [(row, 224) for index, row in df_dets.iterrows()] + pool.starmap(crop_square_image, args) + else: + # Crop and squaring the images in parallel using multiprocessing to speed up the processing + info(f'Cropping {len(df_dets)} detections in parallel using {multiprocessing.cpu_count()} processes...') + with multiprocessing.Pool(num_processes) as pool: + args = [(row, 224) for index, row in df_dets.iterrows()] + pool.starmap(crop_square_image, args) # Drop any rows with crop_path that have files that don't exist - sometimes the crops fail df_dets = df_dets[df_dets['crop_path'].apply(lambda x: os.path.exists(x))] diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py index 85b0db7..691a619 100644 --- a/sdcat/cluster/commands.py +++ b/sdcat/cluster/commands.py @@ -293,12 +293,10 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select save_dir = Path(save_dir) save_dir.mkdir(parents=True, exist_ok=True) - # Grab all images from the input directories supported_extensions = ['.png', '.jpg', '.jpeg', '.JPG', '.JPEG', '.PNG'] images = [] - detections = [] roi_path = Path(roi_dir) for ext in supported_extensions: images.extend(list(roi_path.rglob(f'*{ext}'))) @@ -316,9 +314,10 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select # Sort the dataframe by image_path to make sure the images are in order for start_image and end_image filtering df = df.sort_values(by='image_path') - # Add in a column for the unique crop name for each detection with a unique id - # create a unique uuid based on the md5 hash of the box in the row - df['crop_path'] = df['image_path'] + # Create a unique crop name for each detection with a unique id + crop_path = save_dir / 'crops' + crop_path.mkdir(parents=True, exist_ok=True) + df['crop_path'] = df.apply(lambda row: f"{crop_path}/{uuid.uuid5(uuid.NAMESPACE_DNS, row['image_path'])}.png", axis=1) # Add in a column for the unique crop name for each detection with a unique id df['cluster_id'] = -1 # -1 is the default value and means that the image is not in a cluster @@ -342,7 +341,7 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select # Cluster the detections df_cluster = cluster_vits(prefix, model, df, save_dir, alpha, cluster_selection_epsilon, min_similarity, - min_cluster_size, min_samples) + min_cluster_size, min_samples, roi=True) # Merge the results with the original DataFrame df.update(df_cluster) diff --git a/sdcat/cluster/utils.py b/sdcat/cluster/utils.py index 1f35ee7..9b2858e 100644 --- a/sdcat/cluster/utils.py +++ b/sdcat/cluster/utils.py @@ -92,9 +92,45 @@ def gen_grid(with_attention: bool): # gen_grid(with_attention=True) +def square_image(row, square_dim: int): + """ + Squares an image to the model dimension, filling it with black bars if necessary + :param row: + :param square_dim: dimension of the square image + :return: + """ + try: + if not Path(row.image_path).exists(): + warn(f'Skipping {row.crop_path} because the image {row.image_path} does not exist') + return + + if Path(row.crop_path).exists(): # If the crop already exists, skip it + return + + # Determine the size of the new square + max_side = max(row.image_width, row.image_height) + + # Create a new square image with a black background + new_image = Image.new('RGB', (max_side, max_side), (0, 0, 0)) + + img = Image.open(row.image_path) + + # Paste the original image onto the center of the new image + new_image.paste(img, ((max_side - row.image_width) // 2, (max_side - row.image_height) // 2)) + + # Resize the image to square_dim x square_dim + img = img.resize((square_dim, square_dim), Image.LANCZOS) + + # Save the image + img.save(row.crop_path) + img.close() + except Exception as e: + exception(f'Error cropping {row.image_path} {e}') + raise e + def crop_square_image(row, square_dim: int): """ - Crop the image to a square padding the shorted dimension, then resize it to square_dim x square_dim + Crop the image to a square padding the shortest dimension, then resize it to square_dim x square_dim This also adjusts the crop to make sure the crop is fully in the frame, otherwise the crop that exceeds the frame is filled with black bars - these produce clusters of "edge" objects instead of the detection From f91ac03dcd1a3ff735b982cc2b0647eb8dc32938 Mon Sep 17 00:00:00 2001 From: danellecline Date: Wed, 15 May 2024 17:26:46 -0700 Subject: [PATCH 04/12] fix: removed unused args for start/end frame --- sdcat/cluster/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py index 691a619..b19a40e 100644 --- a/sdcat/cluster/commands.py +++ b/sdcat/cluster/commands.py @@ -269,7 +269,7 @@ def is_day(utc_dt): @click.option('--alpha', help='Alpha is a parameter that controls the linkage. See https://hdbscan.readthedocs.io/en/latest/parameter_selection.html. Default is 0.92. Increase for less conservative clustering, e.g. 1.0', type=float) @click.option('--cluster-selection-epsilon', help='Epsilon is a parameter that controls the linkage. Default is 0. Increase for less conservative clustering', type=float) @click.option('--min-cluster-size', help='The minimum number of samples in a group for that group to be considered a cluster. Default is 2. Increase for less conservative clustering, e.g. 5, 15', type=int) -def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_selection_epsilon, min_cluster_size, start_image, end_image): +def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_selection_epsilon, min_cluster_size): config = cfg.Config(config_ini) max_area = int(config('cluster', 'max_area')) min_area = int(config('cluster', 'min_area')) From 3808bf9531ed27d55e3dbf1e4ebd9cafc9c6454a Mon Sep 17 00:00:00 2001 From: danellecline Date: Wed, 15 May 2024 17:56:50 -0700 Subject: [PATCH 05/12] fix: roi_dir needs to support lists --- sdcat/cluster/commands.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py index b19a40e..3cf8078 100644 --- a/sdcat/cluster/commands.py +++ b/sdcat/cluster/commands.py @@ -297,18 +297,19 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select supported_extensions = ['.png', '.jpg', '.jpeg', '.JPG', '.JPEG', '.PNG'] images = [] - roi_path = Path(roi_dir) - for ext in supported_extensions: - images.extend(list(roi_path.rglob(f'*{ext}'))) + for r in roi_dir: + roi_path = Path(r) + for ext in supported_extensions: + images.extend(list(roi_path.rglob(f'*{ext}'))) # Create a dataframe to store the combined data in an image_path column in sorted order df = pd.DataFrame() df['image_path'] = images - info(f'Found {len(df)} detections in {roi_path}') + info(f'Found {len(df)} detections in {roi_dir}') if len(df) == 0: - info(f'No detections found in {roi_path}') + info(f'No detections found in {roi_dir}') return # Sort the dataframe by image_path to make sure the images are in order for start_image and end_image filtering From 977ef32902f82d4816716118a34b36a8978210a6 Mon Sep 17 00:00:00 2001 From: danellecline Date: Wed, 15 May 2024 18:29:10 -0700 Subject: [PATCH 06/12] fix: path to string --- sdcat/cluster/commands.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py index 3cf8078..cc705a4 100644 --- a/sdcat/cluster/commands.py +++ b/sdcat/cluster/commands.py @@ -306,6 +306,9 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select df = pd.DataFrame() df['image_path'] = images + # Convert the image_path column to a string + df['image_path'] = df['image_path'].astype(str) + info(f'Found {len(df)} detections in {roi_dir}') if len(df) == 0: @@ -318,7 +321,9 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select # Create a unique crop name for each detection with a unique id crop_path = save_dir / 'crops' crop_path.mkdir(parents=True, exist_ok=True) - df['crop_path'] = df.apply(lambda row: f"{crop_path}/{uuid.uuid5(uuid.NAMESPACE_DNS, row['image_path'])}.png", axis=1) + df['crop_path'] = df.apply(lambda row: + f'{crop_path}/{uuid.uuid5(uuid.NAMESPACE_DNS, row["image_path"])}.png', + axis=1) # Add in a column for the unique crop name for each detection with a unique id df['cluster_id'] = -1 # -1 is the default value and means that the image is not in a cluster From a54d88fa4463c8937288565d06a85f274be0c324 Mon Sep 17 00:00:00 2001 From: danellecline Date: Wed, 15 May 2024 18:31:08 -0700 Subject: [PATCH 07/12] chore: remove area for ROI since approximations are probably a bad idea --- sdcat/cluster/commands.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py index cc705a4..4b6d9c9 100644 --- a/sdcat/cluster/commands.py +++ b/sdcat/cluster/commands.py @@ -328,13 +328,6 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select # Add in a column for the unique crop name for each detection with a unique id df['cluster_id'] = -1 # -1 is the default value and means that the image is not in a cluster - # Remove small or large detections before clustering - size_before = len(df) - info(f'Searching through {size_before} detections') - df = df[(df['area'] > min_area) & (df['area'] < max_area)] - size_after = len(df) - info(f'Removed {size_before - size_after} detections that were too large or too small') - # Replace any NaNs with 0 df.fillna(0) From b7e2f21e136abc80d1e88133928cc68d2f30eebf Mon Sep 17 00:00:00 2001 From: danellecline Date: Wed, 15 May 2024 18:33:53 -0700 Subject: [PATCH 08/12] fix: added image width/height and fixed multiproc to square --- sdcat/cluster/cluster.py | 2 +- sdcat/cluster/commands.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/sdcat/cluster/cluster.py b/sdcat/cluster/cluster.py index f15fc35..3f049a0 100644 --- a/sdcat/cluster/cluster.py +++ b/sdcat/cluster/cluster.py @@ -252,7 +252,7 @@ def cluster_vits( info('ROI crops already exist. Creating square crops in parallel using {multiprocessing.cpu_count()} processes...') with multiprocessing.Pool(num_processes) as pool: args = [(row, 224) for index, row in df_dets.iterrows()] - pool.starmap(crop_square_image, args) + pool.starmap(square_image, args) else: # Crop and squaring the images in parallel using multiprocessing to speed up the processing info(f'Cropping {len(df_dets)} detections in parallel using {multiprocessing.cpu_count()} processes...') diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py index 4b6d9c9..0a9c330 100644 --- a/sdcat/cluster/commands.py +++ b/sdcat/cluster/commands.py @@ -14,6 +14,7 @@ import pandas as pd import pytz import torch +from PIL.Image import Image from sdcat import common_args from sdcat.config import config as cfg @@ -318,6 +319,12 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select # Sort the dataframe by image_path to make sure the images are in order for start_image and end_image filtering df = df.sort_values(by='image_path') + # Add the image_width and image_height columns to the dataframe + for index, row in df.iterrows(): + im_size = Image.open(row['image_path']).size + df.at[index, 'image_width'] = im_size[0] + df.at[index, 'image_height'] = im_size[1] + # Create a unique crop name for each detection with a unique id crop_path = save_dir / 'crops' crop_path.mkdir(parents=True, exist_ok=True) From c0da6b67a79c636606a207571873ebb6bf8ebb0b Mon Sep 17 00:00:00 2001 From: danellecline Date: Wed, 15 May 2024 18:35:04 -0700 Subject: [PATCH 09/12] fix: correct PIL image path --- sdcat/cluster/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py index 0a9c330..9371dff 100644 --- a/sdcat/cluster/commands.py +++ b/sdcat/cluster/commands.py @@ -14,7 +14,7 @@ import pandas as pd import pytz import torch -from PIL.Image import Image +from PIL import Image from sdcat import common_args from sdcat.config import config as cfg From 18fc005a5fc39a14327e47b6d0225ea6a79317ab Mon Sep 17 00:00:00 2001 From: danellecline Date: Wed, 15 May 2024 18:37:25 -0700 Subject: [PATCH 10/12] fix: image size in int not float needed for resize --- sdcat/cluster/commands.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py index 9371dff..16ce38c 100644 --- a/sdcat/cluster/commands.py +++ b/sdcat/cluster/commands.py @@ -322,8 +322,8 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select # Add the image_width and image_height columns to the dataframe for index, row in df.iterrows(): im_size = Image.open(row['image_path']).size - df.at[index, 'image_width'] = im_size[0] - df.at[index, 'image_height'] = im_size[1] + df.at[index, 'image_width'] = int(im_size[0]) + df.at[index, 'image_height'] = int(im_size[1]) # Create a unique crop name for each detection with a unique id crop_path = save_dir / 'crops' From cf3ebd37436ddd7e160a0b7b0ea14ac29ba64fe3 Mon Sep 17 00:00:00 2001 From: danellecline Date: Wed, 15 May 2024 18:41:33 -0700 Subject: [PATCH 11/12] fix: image size in int not float needed for resize --- sdcat/cluster/commands.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py index 16ce38c..e263434 100644 --- a/sdcat/cluster/commands.py +++ b/sdcat/cluster/commands.py @@ -322,8 +322,10 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select # Add the image_width and image_height columns to the dataframe for index, row in df.iterrows(): im_size = Image.open(row['image_path']).size - df.at[index, 'image_width'] = int(im_size[0]) - df.at[index, 'image_height'] = int(im_size[1]) + df.at[index, 'image_width'] = im_size[0] + df.at[index, 'image_height'] = im_size[1] + df['image_width'] = df['image_width'].astype(int) + df['image_height'] = df['image_height'].astype(int) # Create a unique crop name for each detection with a unique id crop_path = save_dir / 'crops' From ff9a29dbe6c81cee0e0038401b9a99309c2baecd Mon Sep 17 00:00:00 2001 From: danellecline Date: Wed, 15 May 2024 18:48:02 -0700 Subject: [PATCH 12/12] fix: check for det columns --- sdcat/cluster/cluster.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sdcat/cluster/cluster.py b/sdcat/cluster/cluster.py index 3f049a0..be5069f 100644 --- a/sdcat/cluster/cluster.py +++ b/sdcat/cluster/cluster.py @@ -282,9 +282,15 @@ def cluster_vits( (output_path / prefix).mkdir(parents=True) # Remove everything except ancillary data to include in clustering - ancillary_df = df_dets.drop( - columns=['x', 'y', 'xx', 'xy', 'w', 'h', 'image_width', 'image_height', 'cluster_id', 'cluster', 'score', - 'class', 'image_path', 'crop_path']) + columns = ['x', 'y', 'xx', 'xy', 'w', 'h', 'image_width', 'image_height', 'cluster_id', 'cluster', 'score', + 'class', 'image_path', 'crop_path'] + # Check if the columns exist in the dataframe + if all(col in df_dets.columns for col in columns): + ancillary_df = df_dets.drop( + columns=['x', 'y', 'xx', 'xy', 'w', 'h', 'image_width', 'image_height', 'cluster_id', 'cluster', 'score', + 'class', 'image_path', 'crop_path']) + else: + ancillary_df = df_dets # Cluster the images cluster_sim, unique_clusters, cluster_means, coverage = _run_hdbscan_assign(prefix,