Update CLI clustering

bittremieux-lab · Dec 13, 2021 · b71317c · b71317c
1 parent bca9f1f
commit b71317c
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 49 deletions.
diff --git a/README.md b/README.md
@@ -47,27 +47,29 @@ gleams cluster --help
 GLEAMS provides the `gleams embed` command to convert MS/MS spectra in peak files to 32-dimensional embeddings. Example:
 
 ```
-gleams embed *.mzML --embed_name GLEAMS.embed
+gleams embed *.mzML --embed_name GLEAMS_embed
 ```
 
-This will read the MS/MS spectra from all matched mzML files and export the results to a two-dimensional NumPy array of dimension _n_ x 32 in file `GLEAMS.embed.npy`, with _n_ the number of MS/MS spectra read from the mzML files.
-Additionally, a tabular file `GLEAMS.embed.parquet` will be created containing corresponding metadata for the embedded spectra.
+This will read the MS/MS spectra from all matched mzML files and export the results to a two-dimensional NumPy array of shape _n_ x 32 in the file `GLEAMS_embed.npy`, where _n_ is the number of MS/MS spectra read from the mzML files.
+Additionally, a tabular file `GLEAMS_embed.parquet` will be created containing corresponding metadata for the embedded spectra.
 
 ### Embedding clustering
 
 After converting the MS/MS spectra to 32-dimensional embeddings, they can be clustered to group spectra with similar embeddings using the `gleams cluster` command. Example:
 
 ```
-gleams cluster --embed_name GLEAMS.embed --cluster_name GLEAMS.cluster --eps 0.05
+gleams cluster --embed_name GLEAMS_embed --cluster_name GLEAMS_cluster --distance_threshold 0.3
 ```
 
-This will perform DBSCAN clustering on the embeddings.
-The output will be written to the `GLEAMS.cluster.npy` NumPy file with cluster labels per embedding (`-1` indicates noise, minimum cluster size 2).
-Additionally, a tabular file `GLEAMS.cluster.parquet` will be created containing corresponding metadata for the clustered spectra.
-Note that although this `GLEAMS.cluster.parquet` metadata file contains information for the same spectra as the `GLEAMS.embed.parquet` metadata file, the order of the spectra (matching the clustering results) is different.
+This will perform hierarchical clustering on the embeddings with the given distance threshold.
+The output will be written to the `GLEAMS_cluster.npy` NumPy file with cluster labels per embedding (`-1` indicates noise, minimum cluster size 2).
+Additionally, a file `GLEAMS_cluster_medoids.npy` will be created containing indexes of the cluster representative spectra (medoids).
+
+### Advanced usage
+
+The full GLEAMS configuration, including the settings used to train the neural network, can be modified in the `gleams/config.py` file.
 
 Contact
 -------
 
 For more information you can visit the [official code website](https://github.com/bittremieux/GLEAMS) or send an email to <[email protected]>.
-
diff --git a/gleams/cluster/cluster.py b/gleams/cluster/cluster.py
@@ -52,7 +52,7 @@ def cluster(embeddings_filename: str, metadata_filename: str,
                       'not recomputed')
         return
     clusters_dir = os.path.dirname(clusters_filename)
-    if not os.path.exists(clusters_dir):
+    if clusters_dir and not os.path.exists(clusters_dir):
         os.mkdir(clusters_dir)
     # Sort the metadata by increasing precursor m/z for easy subsetting.
     metadata = (pd.read_parquet(metadata_filename, columns=['charge', 'mz'])

diff --git a/gleams/config.py b/gleams/config.py
@@ -79,4 +79,5 @@
 num_probe = 1024
 
 # Clustering.
+linkage = 'average'
 distance_threshold = 0.35
diff --git a/gleams/gleams.py b/gleams/gleams.py
@@ -37,9 +37,9 @@ def gleams():
 @gleams.command('embed')
 @click.argument('peak_in', nargs=-1, required=True)
 @click.option(
-    '--embed_name', default='GLEAMS.embed',
+    '--embed_name', default='GLEAMS_embed',
     help='The output will be written to the current working directory with the'
-         ' specified name (default: "GLEAMS.embed"). The output consists of a '
+         ' specified name (default: "GLEAMS_embed"). The output consists of a '
          'NumPy file containing the GLEAMS embeddings (extension ".npy") and '
          'a Parquet file containing the corresponding MS/MS spectra metadata '
          '(extension ".parquet").')
@@ -59,7 +59,7 @@ def cli_embed(peak_in: List[str], embed_name: str) -> None:
 
     # Create temporary working directory.
     temp_dir = tempfile.mkdtemp()
-    metadata_filename = os.path.join(temp_dir, 'metadata.parquet')
+    metadata_filename = os.path.join(temp_dir, f'{embed_name}.parquet')
     embed_dir = os.path.join(temp_dir, 'embed')
     os.mkdir(embed_dir)
     # Create a metadata file with the file names.
@@ -104,26 +104,24 @@ def cli_embed(peak_in: List[str], embed_name: str) -> None:
 
 @gleams.command('cluster')
 @click.option(
-    '--embed_name', default='GLEAMS.embed',
-    help='Name of the GLEAMS embeddings (default: "GLEAMS.embed"). Both a '
+    '--embed_name', default='GLEAMS_embed',
+    help='Name of the GLEAMS embeddings (default: "GLEAMS_embed"). Both a '
          'NumPy file and a Parquet file should be present in the current '
          'working directory.')
 @click.option(
-    '--cluster_name', default='GLEAMS.cluster',
+    '--cluster_name', default='GLEAMS_cluster',
     help='The output will be written to the current working directory with the'
-         ' specified name (default: "GLEAMS.cluster"). The output consists of '
+         ' specified name (default: "GLEAMS_cluster"). The output consists of '
          'a NumPy file containing the cluster labels (extension ".npy") and '
-         'a Parquet file containing the corresponding MS/MS spectra metadata '
-         '(extension ".parquet"). Attention: the spectrum order in this '
-         'metadata file differs from the order in the embedding metadata '
-         'file.')
+         'a NumPy file containing indexes of the cluster medoid spectra '
+         '(extension "_medoids.npy").')
 @click.option(
-    '--eps', default=0.05,
-    help='The maximum Euclidean distance between embeddings to be considered '
-         'in each other\'s neighborhood during DBSCAN clustering '
-         '(default: 0.05).'
+    '--distance_threshold', default=0.3,
+    help='The Euclidean distance threshold between embeddings to be merged '
+         'during hierarchical clustering (average linkage) (default: 0.3).'
 )
-def cli_cluster(embed_name: str, cluster_name: str, eps: float) -> None:
+def cli_cluster(embed_name: str, cluster_name: str,
+                distance_threshold: float) -> None:
     """
     Cluster embeddings.
 
@@ -132,26 +130,7 @@ def cli_cluster(embed_name: str, cluster_name: str, eps: float) -> None:
     """
     logger.info('GLEAMS version %s', str(__version__))
 
-    # Create temporary working directory.
-    temp_dir = tempfile.mkdtemp()
-    dist_filename = os.path.join(temp_dir, f'{embed_name}.npz')
-    # Compute the pairwise distance to a temporary file.
-    cluster.compute_pairwise_distances(
-        f'{embed_name}.npy', f'{embed_name}.parquet', dist_filename,
-        config.precursor_tol_mass, config.precursor_tol_mode,
-        config.mz_interval, config.num_neighbors, config.num_neighbors_ann,
-        config.num_probe, config.batch_size_add, config.batch_size_dist,
-        config.charges)
-    # Move the metadata file to the working directory.
-    shutil.move(os.path.join(temp_dir, f'{embed_name}.parquet'),
-                f'{cluster_name}.parquet')
-    # Remove previous result (if applicable).
-    if os.path.isfile(f'{cluster_name}.npy'):
-        os.remove(f'{cluster_name}.npy')
-    # DBSCAN clustering.
-    cluster.cluster(
-        dist_filename, f'{cluster_name}.parquet', f'{cluster_name}.npy',
-        eps, config.min_samples, config.precursor_tol_mass,
-        config.precursor_tol_mode)
-    # Clean up intermediate files.
-    shutil.rmtree(temp_dir)
+    cluster.cluster(f'{embed_name}.npy', f'{embed_name}.parquet',
+                    f'{cluster_name}.npy', config.precursor_tol_mass,
+                    config.precursor_tol_mode, config.linkage,
+                    distance_threshold, config.charges)