Skip to content

Commit

Permalink
Update CLI clustering
Browse files Browse the repository at this point in the history
  • Loading branch information
bittremieux committed Dec 13, 2021
1 parent bca9f1f commit b71317c
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 49 deletions.
20 changes: 11 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,27 +47,29 @@ gleams cluster --help
GLEAMS provides the `gleams embed` command to convert MS/MS spectra in peak files to 32-dimensional embeddings. Example:

```
gleams embed *.mzML --embed_name GLEAMS.embed
gleams embed *.mzML --embed_name GLEAMS_embed
```

This will read the MS/MS spectra from all matched mzML files and export the results to a two-dimensional NumPy array of dimension _n_ x 32 in file `GLEAMS.embed.npy`, with _n_ the number of MS/MS spectra read from the mzML files.
Additionally, a tabular file `GLEAMS.embed.parquet` will be created containing corresponding metadata for the embedded spectra.
This will read the MS/MS spectra from all matched mzML files and export the results to a two-dimensional NumPy array of dimension _n_ x 32 in file `GLEAMS_embed.npy`, with _n_ the number of MS/MS spectra read from the mzML files.
Additionally, a tabular file `GLEAMS_embed.parquet` will be created containing corresponding metadata for the embedded spectra.

### Embedding clustering

After converting the MS/MS spectra to 32-dimensional embeddings, they can be clustered to group spectra with similar embeddings using the `gleams cluster` command. Example:

```
gleams cluster --embed_name GLEAMS.embed --cluster_name GLEAMS.cluster --eps 0.05
gleams cluster --embed_name GLEAMS_embed --cluster_name GLEAMS_cluster --distance_threshold 0.3
```

This will perform DBSCAN clustering on the embeddings.
The output will be written to the `GLEAMS.cluster.npy` NumPy file with cluster labels per embedding (`-1` indicates noise, minimum cluster size 2).
Additionally, a tabular file `GLEAMS.cluster.parquet` will be created containing corresponding metadata for the clustered spectra.
Note that although this `GLEAMS.cluster.parquet` metadata file contains information for the same spectra as the `GLEAMS.embed.parquet` metadata file, the order of the spectra (matching the clustering results) is different.
This will perform hierarchical clustering on the embeddings with the given distance threshold.
The output will be written to the `GLEAMS_cluster.npy` NumPy file with cluster labels per embedding (`-1` indicates noise, minimum cluster size 2).
Additionally, a file `GLEAMS_cluster_medoids.npy` will be created containing indexes of the cluster representative spectra (medoids).

### Advanced usage

The full GLEAMS configuration, including the settings used to train the neural network, can be modified in the `gleams/config.py` file.

Contact
-------

For more information you can visit the [official code website](https://github.com/bittremieux/GLEAMS) or send an email to <[email protected]>.

2 changes: 1 addition & 1 deletion gleams/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def cluster(embeddings_filename: str, metadata_filename: str,
'not recomputed')
return
clusters_dir = os.path.dirname(clusters_filename)
if not os.path.exists(clusters_dir):
if clusters_dir and not os.path.exists(clusters_dir):
os.mkdir(clusters_dir)
# Sort the metadata by increasing precursor m/z for easy subsetting.
metadata = (pd.read_parquet(metadata_filename, columns=['charge', 'mz'])
Expand Down
1 change: 1 addition & 0 deletions gleams/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,5 @@
num_probe = 1024

# Clustering.
linkage = 'average'
distance_threshold = 0.35
57 changes: 18 additions & 39 deletions gleams/gleams.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ def gleams():
@gleams.command('embed')
@click.argument('peak_in', nargs=-1, required=True)
@click.option(
'--embed_name', default='GLEAMS.embed',
'--embed_name', default='GLEAMS_embed',
help='The output will be written to the current working directory with the'
' specified name (default: "GLEAMS.embed"). The output consists of a '
' specified name (default: "GLEAMS_embed"). The output consists of a '
'NumPy file containing the GLEAMS embeddings (extension ".npy") and '
'a Parquet file containing the corresponding MS/MS spectra metadata '
'(extension ".parquet").')
Expand All @@ -59,7 +59,7 @@ def cli_embed(peak_in: List[str], embed_name: str) -> None:

# Create temporary working directory.
temp_dir = tempfile.mkdtemp()
metadata_filename = os.path.join(temp_dir, 'metadata.parquet')
metadata_filename = os.path.join(temp_dir, f'{embed_name}.parquet')
embed_dir = os.path.join(temp_dir, 'embed')
os.mkdir(embed_dir)
# Create a metadata file with the file names.
Expand Down Expand Up @@ -104,26 +104,24 @@ def cli_embed(peak_in: List[str], embed_name: str) -> None:

@gleams.command('cluster')
@click.option(
'--embed_name', default='GLEAMS.embed',
help='Name of the GLEAMS embeddings (default: "GLEAMS.embed"). Both a '
'--embed_name', default='GLEAMS_embed',
help='Name of the GLEAMS embeddings (default: "GLEAMS_embed"). Both a '
'NumPy file and a Parquet file should be present in the current '
'working directory.')
@click.option(
'--cluster_name', default='GLEAMS.cluster',
'--cluster_name', default='GLEAMS_cluster',
help='The output will be written to the current working directory with the'
' specified name (default: "GLEAMS.cluster"). The output consists of '
' specified name (default: "GLEAMS_cluster"). The output consists of '
'a NumPy file containing the cluster labels (extension ".npy") and '
'a Parquet file containing the corresponding MS/MS spectra metadata '
'(extension ".parquet"). Attention: the spectrum order in this '
'metadata file differs from the order in the embedding metadata '
'file.')
'a NumPy file containing indexes of the cluster medoid spectra '
'(extension "_medoids.npy").')
@click.option(
'--eps', default=0.05,
help='The maximum Euclidean distance between embeddings to be considered '
'in each other\'s neighborhood during DBSCAN clustering '
'(default: 0.05).'
'--distance_threshold', default=0.3,
help='The Euclidean distance threshold between embeddings to be merged '
'during hierarchical clustering (average linkage) (default: 0.3).'
)
def cli_cluster(embed_name: str, cluster_name: str, eps: float) -> None:
def cli_cluster(embed_name: str, cluster_name: str,
distance_threshold: float) -> None:
"""
Cluster embeddings.
Expand All @@ -132,26 +130,7 @@ def cli_cluster(embed_name: str, cluster_name: str, eps: float) -> None:
"""
logger.info('GLEAMS version %s', str(__version__))

# Create temporary working directory.
temp_dir = tempfile.mkdtemp()
dist_filename = os.path.join(temp_dir, f'{embed_name}.npz')
# Compute the pairwise distance to a temporary file.
cluster.compute_pairwise_distances(
f'{embed_name}.npy', f'{embed_name}.parquet', dist_filename,
config.precursor_tol_mass, config.precursor_tol_mode,
config.mz_interval, config.num_neighbors, config.num_neighbors_ann,
config.num_probe, config.batch_size_add, config.batch_size_dist,
config.charges)
# Move the metadata file to the working directory.
shutil.move(os.path.join(temp_dir, f'{embed_name}.parquet'),
f'{cluster_name}.parquet')
# Remove previous result (if applicable).
if os.path.isfile(f'{cluster_name}.npy'):
os.remove(f'{cluster_name}.npy')
# DBSCAN clustering.
cluster.cluster(
dist_filename, f'{cluster_name}.parquet', f'{cluster_name}.npy',
eps, config.min_samples, config.precursor_tol_mass,
config.precursor_tol_mode)
# Clean up intermediate files.
shutil.rmtree(temp_dir)
cluster.cluster(f'{embed_name}.npy', f'{embed_name}.parquet',
f'{cluster_name}.npy', config.precursor_tol_mass,
config.precursor_tol_mode, config.linkage,
distance_threshold, config.charges)

0 comments on commit b71317c

Please sign in to comment.