diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7c3c413..f607b29 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,7 +11,7 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: [3.10.14] + python-version: [3.10.15] steps: - uses: actions/checkout@v2 @@ -57,7 +57,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.10.14] + python-version: [3.10.15] needs: build steps: diff --git a/.gitignore b/.gitignore index a987bd0..fb1251f 100644 --- a/.gitignore +++ b/.gitignore @@ -106,7 +106,7 @@ celerybeat-schedule .venv env/ venv/ -clustering_venv/ +cluster_venv/ ENV/ env.bak/ venv.bak/ @@ -154,26 +154,26 @@ benchmark_data/examples/P15291/P15291_distance_differences/* benchmark_data/examples/P15291/P15291_distance_difference_maps/* benchmark_data/examples/P15291/P15291_ca_distances/* benchmark_data/examples/P15291/P15291_cluster_results/* -benchmark_data/examples/P15291/P15291_alpha_fold_mmcifs/* +benchmark_data/examples/P15291/P15291_alphafold/* !benchmark_data/examples/P15291/P15291_distance_differences/.gitkeep !benchmark_data/examples/P15291/P15291_distance_difference_maps/.gitkeep !benchmark_data/examples/P15291/P15291_ca_distances/.gitkeep !benchmark_data/examples/P15291/P15291_cluster_results/.gitkeep -!benchmark_data/examples/P15291/P15291_alpha_fold_mmcifs/.gitkeep +!benchmark_data/examples/P15291/P15291_alphafold/.gitkeep # Example outputs -- O34926 benchmark_data/examples/O34926/O34926_distance_differences/* benchmark_data/examples/O34926/O34926_distance_difference_maps/* benchmark_data/examples/O34926/O34926_ca_distances/* benchmark_data/examples/O34926/O34926_cluster_results/* -benchmark_data/examples/O34926/O34926_alpha_fold_mmcifs/* +benchmark_data/examples/O34926/O34926_path_alphafold/* !benchmark_data/examples/O34926/O34926_distance_differences/.gitkeep !benchmark_data/examples/O34926/O34926_distance_difference_maps/.gitkeep !benchmark_data/examples/O34926/O34926_ca_distances/.gitkeep !benchmark_data/examples/O34926/O34926_cluster_results/.gitkeep -!benchmark_data/examples/O34926/O34926_alpha_fold_mmcifs/.gitkeep +!benchmark_data/examples/O34926/O34926_path_alphafold/.gitkeep # Outputs from unit tests tests/test_output/* diff --git a/README.md b/README.md index b56f005..2606c0c 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Monomeric protein conformational state clustering -These scripts can be used to cluster a parsed set of monomeric protein chains via a global conformational change metric based on CA distances. Once the peptide chains destined for clustering have been specified, a pairwise CA distance matrix for each chain is produced. Distance difference matrices are then generated, again, pairwise but between CA distance matrices here. Therefore, for `N` unique peptide chains, `N` CA distance matrices and `N^2` distance difference matrices are generated. +These scripts can be used to cluster a parsed set of monomeric protein chains via a global conformational change metric based on CA distances. Once the polypeptide chains destined for clustering have been specified, a pairwise CA distance matrix for each chain is produced. Distance difference matrices are then generated, again, pairwise but between CA distance matrices. Therefore, for `N` unique peptide chains, `N` CA distance matrices and `N*(N-1)/1` distance difference matrices are generated. _NB: the score between A->B is the same as B->A_. -Additional scripts are provided to cluster the chains based on distance-based scores calculated from all pairwise distance difference matricies, as well as scripts to produce dendrograms of the clustering results, swarm plots of the scores, and heatmaps for each distance difference matrix. +Additional scripts are provided to cluster the chains based on distance-based scores calculated from all pairwise distance difference matricies, as well as scripts to produce dendrograms of the clustering results, and heatmaps for each distance difference matrix. Example input data is provided in the `benchmark_data/examples` folder, including scripts to download and save data from the [PDBe-KB's benchmark conformational state dataset](http://ftp.ebi.ac.uk/pub/databases/pdbe-kb/benchmarking/distinct-monomer-conformers/). Example scripts are included in `examples`, which run complete executions of the entire pipeline for a selection of structures from several difference UniProt accessions. @@ -13,10 +13,10 @@ For intructions on importing `protein-cluster-conformers` into your own Python c `protein-cluster-conformers` requires >=Python3.10 to run. Initialise virtual environment and install dependencies with: ```shell -$ cd protein-cluster-conformers] -$ python3.10 -m venv cluster_venv -$ source cluster_venv/bin/activate -$ python -m pip install -r requirements.txt +cd protein-cluster-conformers +python3.10 -m venv cluster_venv +source cluster_venv/bin/activate +pip install -r requirements.txt ``` _____ @@ -26,7 +26,7 @@ _____ To cluster a set of protein structures, run the `find_clusters.py` script: ```shell -$ python find_conformers.py [-h] [-v] -u UNIPROT -m MMCIF [MMCIF ...] +python3 find_conformers.py [-h] [-v] -u UNIPROT -m MMCIF [MMCIF ...] [-s PATH_CLUSTERS] -c PATH_CA [-d PATH_DD] [-g PATH_DENDROGRAM [PATH_DENDROGRAM ...]] [-w PATH_SWARM [PATH_SWARM ...]] [-o PATH_HISTOGRAM] @@ -70,12 +70,12 @@ optional arguments: ### **Run instructions** -#### Option 1) Cluster only +#### Option 1) Cluster and save matrices To only cluster a set of monomeric protein structures that share part or all of the same UniProt sequence, run: ``` shell -$ python find_clusters.py -u "A12345" \ +python3 find_clusters.py -u "A12345" \ -m /path/to/structure_1.cif [chains] \ -m /path/to/structure_2.cif [chains] \ ... \ @@ -85,16 +85,18 @@ $ python find_clusters.py -u "A12345" \ The paths to each structure are parsed using the `-m` flag. -Chain IDs (only `struct_asym_id` is currently recognised) should be given as space-delimited arguments after the path. Parse in multiple structures using consecutive `-m` flags. The UniProt accession must be parsed using the `-u` flag. +Chain IDs (only `struct_asym_id` is currently recognised at the moment) should be given as space-delimited arguments after the path. Parse in multiple structures using consecutive `-m` flags. The UniProt accession must be parsed using the `-u` flag. **Example**: O34926 ```shell -$ python find_conformers.py -u "O34926" \ +python3 find_conformers.py -u "O34926" \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc3_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc5_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc6_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc7_updated.cif A B \ + -c benchmark_data/examples/O34926/O34926_ca_distances \ + -d benchmark_data/examples/O34926/O34926_distance_differences/ \ -s benchmark_data/examples/O34926/O34926_cluster_results/ ``` @@ -102,12 +104,12 @@ By default, the pipeline only clusters the parsed mmCIFs (and specified chains), ---- -#### Option 2) Save matrices only +#### Option 2) Save CA matrices only To save the matrices produced in the pipeline, simply specify the path in which to save them using the `-c` flag for CA distance matrices and the `-d` flag for CA distance difference matrices: ```shell -$ python find_clusters.py -u "A12345" \ +$ python find_conformers.py -u "A12345" \ -m /path/to/structure_1.cif [chains] \ -m ... \ -s /path/to/save/cluster_results.csv \ @@ -115,18 +117,15 @@ $ python find_clusters.py -u "A12345" \ -d /path/to/save/distance/difference/matrices/ ``` -These flags are mutually exclusive, meaning either or both can be used at the same time. - **Example**: O34926 ```shell -$ python find_clusters.py -u "O34926" \ +python3 find_conformers.py -u "O34926" \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc3_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc5_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc6_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc7_updated.cif A B \ - -c benchmark_data/examples/O34926_CA_distances \ - -d benchmark_data/examples/O34926_distance_differences/ + -c benchmark_data/examples/O34926/O34926_ca_distances \ ``` --- @@ -136,14 +135,12 @@ $ python find_clusters.py -u "O34926" \ 2D histograms (heatmaps) can be rendered and saved for each CA distance difference matrix by specifying the save directory using the `-o` flag: ```shell -$ python find_clusters.py -u "A12345" \ +$ python find_conformers.py -u "A12345" \ -m /path/to/structure_1.cif [chains] \ -m ... \ -o /path/to/save/distance/difference/2D/histograms/ ``` -To save the histogram in the default directory (`test/ouputs/dd_histograms`), simply parse the `-o` flag without a path. - The resulting plots are saved in PNG format (to save render time). E.g: Distance difference map of 6hac chain A to 6hae chain K @@ -153,12 +150,14 @@ The resulting plots are saved in PNG format (to save render time). E.g: **Example**: O34926 ```shell -$ python find_clusters.py -u "O34926" \ +python3 find_conformers.py -u "O34926" \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc3_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc5_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc6_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc7_updated.cif A B \ - -o ./benchmark_data/examples/O34926/O34926_distance_difference_maps/ + -c benchmark_data/examples/O34926/O34926_ca_distances \ + -d benchmark_data/examples/O34926/O34926_distance_differences/ \ + -o benchmark_data/examples/O34926/O34926_distance_difference_maps/ ``` --- @@ -168,7 +167,7 @@ $ python find_clusters.py -u "O34926" \ From the clustering results, a dendrogram can be rendered to show the relationships between all clustered chains. To save a dendrogram of the hierarchical clustering results, run: ```shell -$ python find_clusters.py -u "A12345" \ +$ python find_conformers.py -u "A12345" \ -m /path/to/structure_1.cif [chains] \ -m ... \ -g /path/to/save/dendrogram/ [png svg] @@ -185,22 +184,24 @@ where either a `png` or `svg` file type is saved. E.g. **Example**: O34926 ```shell -$ python find_clusters.py -u "O34926" \ +python3 find_conformers.py -u "O34926" \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc3_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc5_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc6_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc7_updated.cif A B \ + -c benchmark_data/examples/O34926/O34926_ca_distances \ + -d benchmark_data/examples/O34926/O34926_distance_differences/ \ -g benchmark_data/examples/O34926/O34926_cluster_results/ png svg ``` --- -#### Option 5) Render swarm plot + -#### Option 6) Include AlphaFold Database structure(s) +#### Option 6) Include AlphaFold Database structure when generating CA and distance difference matrices By parsing in the `-a` flag, the script will attempt to download and cluster the pre-generated AlphaFold structure, stored on the [AlphaFold Database](https://alphafold.ebi.ac.uk/). You do not need to have downloaded the predicted AlphaFold structure already but must be connected to the internet. The structure will be saved ```shell -$ python find_clusters.py -u "A12345" \ +$ python find_conformers.py -u "A12345" \ -m /path/to/structure_1.cif [chains] \ -m ... \ -a @@ -239,12 +242,14 @@ $ python find_clusters.py -u "A12345" \ **Example**: O34926 ```shell -$ python find_clusters.py -u "O34926" \ +python3 find_conformers.py -u "O34926" \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc3_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc5_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc6_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc7_updated.cif A B \ - -a + -c benchmark_data/examples/O34926/O34926_ca_distances \ + -d benchmark_data/examples/O34926/O34926_distance_differences/ \ + -a benchmark_data/examples/O34926/O34926_path_alphafold/ ``` ------ @@ -254,7 +259,7 @@ $ python find_clusters.py -u "O34926" \ **Example #1:** O34926 ```shell -$ python find_conformers.py -u "O34926" \ +python3 find_conformers.py -u "O34926" \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc3_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc5_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc6_updated.cif A B \ @@ -263,14 +268,14 @@ $ python find_conformers.py -u "O34926" \ -d benchmark_data/examples/O34926/O34926_distance_differences/ \ -s benchmark_data/examples/O34926/O34926_cluster_results/ \ -o benchmark_data/examples/O34926/O34926_distance_difference_maps/ \ - -g benchmark_data/examples/O34926/O34926_cluster_results/ png svg - -a + -g benchmark_data/examples/O34926/O34926_cluster_results/ png svg \ + -a benchmark_data/examples/O34926/O34926_path_alphafold/ ``` -or use the `run_O34926.sh` script. +or use the `examples/run_O34926.sh` script. ``` shell -$ ./run_O34926.sh +$ ./examples/run_O34926.sh ``` **Example #2:** P15291 @@ -287,19 +292,55 @@ python3 find_conformers.py -u "P15291" \ -d benchmark_data/examples/P15291/P15291_distance_differences/ \ -s benchmark_data/examples/P15291/P15291_cluster_results/ \ -o benchmark_data/examples/P15291/P15291_distance_difference_maps/ \ - -g benchmark_data/examples/P15291/P15291_cluster_results/ png svg - -w benchmark_data/examples/P15291/P15291_cluster_results/ png svg - -a + -g benchmark_data/examples/P15291/P15291_cluster_results/ png svg \ + -a benchmark_data/examples/P15291/P15291_path_alphafold/ ``` -or execute the `run_P15291.sh` script. +or execute the `examples/run_P15291.sh` script. ```shell -$ ./run_P15291.sh +$ ./examples/run_P15291.sh ``` When imported into Orc, the arguments required to execute the clustering process correctly will be parsed into the class instance and related methods as lists generated from the preceding functions called by the existing `protein-superpose` pipeline. +#### Optional arguments + +The start and end residue positions can be parsed into the script using the `-0` and `-1` flags, respectively. This will not restrict the residue ranges during clustering but will be used to label the axes of the distance difference maps and dendrograms. + +*Example*: O34926: + +```shell +python3 find_conformers.py -u "O34926" \ + -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc3_updated.cif A B \ + -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc5_updated.cif A B \ + -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc6_updated.cif A B \ + -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc7_updated.cif A B \ + -c benchmark_data/examples/O34926/O34926_ca_distances/ \ + -d benchmark_data/examples/O34926/O34926_distance_differences/ \ + -s benchmark_data/examples/O34926/O34926_cluster_results/ \ + -g benchmark_data/examples/O34926/O34926_cluster_results/ png svg \ + -0 1 \ + -1 405 +``` + +The pipeline will avoid re-processing existing files where it files them. To update a single PDB entry, specify the PDB accession using the `-i` flag, e.g. `-i 3nc3`. To force all entries to be re-processed, use the `-f` flag, which will overwrite existing files indescriminately. + +*Example*: O34926: + +```shell +python3 find_conformers.py -u "O34926" \ + -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc3_updated.cif A B \ + -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc5_updated.cif A B \ + -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc6_updated.cif A B \ + -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc7_updated.cif A B \ + -c benchmark_data/examples/O34926/O34926_ca_distances/ \ + -d benchmark_data/examples/O34926/O34926_distance_differences/ \ + -s benchmark_data/examples/O34926/O34926_cluster_results/ \ + -g benchmark_data/examples/O34926/O34926_cluster_results/ png svg \ + -f +``` + --- ### Run on benchmark dataset @@ -307,7 +348,7 @@ When imported into Orc, the arguments required to execute the clustering process The scripts above are called by the `run_benchmark.py` wrapper. To generate conformational clustering results for the included benchmark dataset, run: ``` shell -$ python cluster_benchmark.py +python3 cluster_benchmark.py ``` This will call the `run_benchmark(...)` functions included in `ca_distance.py`, `distance_difference.py`, `cluster.py`, `plot_distance_difference.py`, `plot_dendrogram.py` and `plot_swarm_plot.py`. No arguments need parsing into the script. diff --git a/__version__.py b/__version__.py index c68196d..a955fda 100644 --- a/__version__.py +++ b/__version__.py @@ -1 +1 @@ -__version__ = "1.2.0" +__version__ = "1.2.1" diff --git a/benchmark_data/examples/P15291/P15291_alpha_fold_mmcifs/.gitkeep b/benchmark_data/examples/P15291/P15291_alpha_fold_mmcifs/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/benchmark_data/examples/P15291/P15291_ca_distances/.gitkeep b/benchmark_data/examples/P15291/P15291_ca_distances/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/benchmark_data/examples/P15291/P15291_cluster_results/.gitkeep b/benchmark_data/examples/P15291/P15291_cluster_results/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/benchmark_data/examples/P15291/P15291_distance_difference_maps/.gitkeep b/benchmark_data/examples/P15291/P15291_distance_difference_maps/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/benchmark_data/examples/P15291/P15291_distance_differences/.gitkeep b/benchmark_data/examples/P15291/P15291_distance_differences/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/cluster_benchmark.py b/cluster_benchmark.py index bf54208..dcc2341 100644 --- a/cluster_benchmark.py +++ b/cluster_benchmark.py @@ -101,12 +101,12 @@ def benchmark_cluster(benchmark_df, unp): png_bool = True svg_bool = True - unp_cluster.make_dendrogram( - PATH_SAVE_CLUSTER_RESULTS.joinpath("dendrograms"), png=png_bool, svg=svg_bool - ) - - unp_cluster.make_swarmplot( - PATH_SAVE_CLUSTER_RESULTS.joinpath("swarm_plots"), png=png_bool, svg=svg_bool + cluster_monomers.render_dendrogram( + unp=unp, + path_results=PATH_SAVE_CLUSTER_RESULTS, + path_save=PATH_SAVE_CLUSTER_RESULTS.joinpath("dendrograms"), + png=png_bool, + svg=svg_bool, ) """ diff --git a/cluster_conformers/cluster_chains.py b/cluster_conformers/cluster_chains.py index 341da64..9227d55 100644 --- a/cluster_conformers/cluster_chains.py +++ b/cluster_conformers/cluster_chains.py @@ -10,19 +10,16 @@ distance difference maps. """ -# Standard package imports -from typing import Iterable - -import seaborn as sns - # Third-party modules -from matplotlib import pyplot as plt +import seaborn as sns from matplotlib.axes import Axes from matplotlib.figure import Figure from numpy import column_stack, ndarray, zeros from scipy.cluster.hierarchy import dendrogram from sklearn.cluster import AgglomerativeClustering +from cluster_conformers.utils.linear_algebra_utils import upper_triangle + def make_linkage_matx(model: AgglomerativeClustering) -> "ndarray[any, float]": """ @@ -103,7 +100,7 @@ def cluster_agglomerative( def plot_dendrogram( unp: str, axis, linkage_matrix: ndarray = None, cutoff: float = None, **kwargs -) -> "tuple(Figure, Axes)": +) -> "tuple[Figure, Axes]": """ Create linkage matrix from SKLearn model and plot the dendrogram of nodes. Applies this to parsed Matplotlib axis, inplace. @@ -167,31 +164,20 @@ def plot_dendrogram( del dendrogram_plot -def plot_swarmplot(y_data: Iterable, unp: str) -> "tuple(Figure, Axes)": - """Creates a strip plot of non-overlapping data points for a given list of data. The +def plot_swarmplot(scores: ndarray, axis) -> "tuple[Figure, Axes]": + """Creates a swarm plot of non-overlapping data points for a given list of data. The values of the data will correspond to their y-values. Their position along the x-axis is irrelevant as they're all identical. - :param y_data: Array of data points to plot. - :type y_data: Iterable - :param unp: UniProt accession. - :type unp: str + :param scores: Matrix of GLOCON scores. + :type scores: Iterable + :param axis: Matplotlib axis to plot the swarm plot on. + :type axis: Axes :return: Figure and axis objects containing the plotted swarm plot :rtype: tuple(matplotlib.figure.Figure, matplotlib.axes.Axes) """ - # Init the figure - _, ax = plt.subplots( - 1, - 1, - figsize=(4, 5), - # ncols=1, # INTRA | bar | INTER | bar - # nrows=1, - # gridspec_kw=dict(width_ratios=[4, 0.2, 4, 0.2]), - tight_layout=True, - ) # Plot the data - sns.swarmplot(data=y_data, ax=ax, size=5) # vmax=max_dist , + sns.swarmplot(data=upper_triangle(scores), ax=axis, size=5) # vmax=max_dist , # Add some formatting - ax.set_title(unp, fontweight="bold") - ax.set_ylabel("Score (\u212B)") + axis.set_ylabel("GLOCON score (\u212B)") diff --git a/cluster_conformers/cluster_monomers.py b/cluster_conformers/cluster_monomers.py index a18e600..742817d 100644 --- a/cluster_conformers/cluster_monomers.py +++ b/cluster_conformers/cluster_monomers.py @@ -124,6 +124,8 @@ def __init__( # Handle pre-clustering AlphaFold file information if path_save_alphafold: + path_save_alphafold.mkdir(exist_ok=True, parents=True) + # Download and save afdb_path = download_utils.download_alphafold_mmcif( self.unp, path_save_alphafold @@ -149,13 +151,15 @@ def __init__( af_chain = "A" self.chains[af_prefix] = [af_chain] self.chains_all.append(af_chain) + self.pdbe_chain_ids.append(f"{af_prefix}_{af_chain}") # Parse AlphaFold structure for extracting chain info for superpose.py afdb_mmcif = parsing_utils.parse_mmcif(afdb_structure, af_chain) + # Storing the start-end UniProt residue indices for protein-superpose self.af_unp_range = ( - afdb_mmcif["unp_res_ids"][0], - afdb_mmcif["unp_res_ids"][-1], + min(afdb_mmcif["unp_res_ids"]), + max(afdb_mmcif["unp_res_ids"]), ) # Number of threads for multiprocessing. Only use 1 if few unique chains @@ -202,7 +206,7 @@ def remove_entry_matxs( logger.debug(f"Removing {file}") file.unlink() - def _generate_ca_matx(self, pdbe_chain_id: str) -> "tuple(dict)": + def _generate_ca_matx(self, pdbe_chain_id: str) -> "tuple[dict]": """ Method for calculating and saving the CA distance matrix for a given PDB-chain ID string. @@ -278,7 +282,7 @@ def ca_distance(self, path_save: PosixPath = None) -> None: # Dir to save the raw UniProt residue IDs as 1D np.array()s self.path_save_unps = path_save.joinpath("unp_residue_ids") - self.path_save_unps.mkdir(exist_ok=True) + self.path_save_unps.mkdir(exist_ok=True, parents=True) self.path_save_base_ca = path_save @@ -463,6 +467,9 @@ def cluster( :type path_save_cluster_results: PosixPath, optional """ + if path_save_dd_matx: + path_save_dd_matx.mkdir(exist_ok=True, parents=True) + logger.info("Generating distance difference matrices...") self.score_matx, self.label_matx = self.build_clustering_inputs( path_save_dd_matx @@ -517,6 +524,7 @@ def cluster( # Write out clustering results if path specified if path_save_cluster_results: + path_save_cluster_results.mkdir(exist_ok=True, parents=True) # Save clustering results path_save_all_conf = path_save_cluster_results.joinpath( @@ -706,39 +714,108 @@ def render_dendrogram( ) return - if not np.array_equal(linkage_matx, np.array([0])): - - fig, ax = plt.subplots(1, 1) - - logger.info("Rendering dendogram") - cluster_chains.plot_dendrogram( - unp, - ax, - linkage_matx, - CLUSTERING_CUTOFF_PC, - labels=pdbe_chain_ids, - leaf_rotation=90, - ) # p=3 - - # UniProt residue range specified, make modifications (optional) - if unp_range: - ax.set_title( - f"Agglomerative clustering results: {unp} ({unp_range[0]}-{unp_range[1]})", - fontweight="bold", - ) - fname = f"{unp}_{unp_range[0]}_{unp_range[1]}_agglomerative_dendrogram" - else: - fname = f"{unp}_agglomerative_dendrogram" - - # Save file - io_utils.save_figure( - path_save, - save_fname=fname, - png=png, - svg=svg, + if np.array_equal(linkage_matx, np.array([0])): + logger.info("Single cluster for segment. Not rendering dendrogram.") + return + + fig, ax = plt.subplots(1, 1) + + logger.info("Rendering dendogram") + cluster_chains.plot_dendrogram( + unp, + ax, + linkage_matx, + CLUSTERING_CUTOFF_PC, + labels=pdbe_chain_ids, + leaf_rotation=90, + ) # p=3 + + # UniProt residue range specified, make modifications (optional) + if unp_range: + ax.set_title( + f"Agglomerative clustering results: {unp} ({unp_range[0]}-{unp_range[1]})", + fontweight="bold", ) + fname = f"{unp}_{unp_range[0]}_{unp_range[1]}_agglomerative_dendrogram" + else: + fname = f"{unp}_agglomerative_dendrogram" + + # Save file + io_utils.save_figure( + path_save, + save_fname=fname, + png=png, + svg=svg, + ) + + # plt.close(fig=fig) - plt.close(fig=fig) +def render_swarmplot( + unp: str, + path_results: PosixPath, + path_save: PosixPath = None, + png: bool = False, + svg: bool = False, + unp_range: "tuple[int, int]" = None, +) -> None: + """ + Plot hierachical dendrogram from clustering results. Must have a linkage matrix and + ordered labels object already stored. The easiest way to get these is to run the + ClusterConformations() object first and point to its output folder. + + :param path_save: Path to save rendered dendrogram image. + :type path_save: PosixPath + :param png: Save dendrogram image in PNG format, defaults to False + :type png: bool, optional + :param svg: Save dendrogram image in SVG format, defaults to False + :type svg: bool, optional + :param unp_range: Range of UniProt residues used for clustering + """ + + # Set matplotlib global formatting + appearance_utils.init_plot_appearance() + + try: + score_matx = io_utils.load_matrix( + path_results.joinpath(f"{unp}_score_matrix.npz") + ) + + except OSError: + logger.error( + "Linkage matrix and/or label list not found. Please run clustering first." + ) + return + + if np.array_equal(score_matx, np.array([0])): + logger.info("Single cluster for segment. Not rendering swarm plot.") + return + + fig, ax = plt.subplots(1, 1) + + logger.info("Rendering swarm plot") + cluster_chains.plot_swarmplot( + score_matx, + ax, + ) + + # UniProt residue range specified, make modifications (optional) + if unp_range: + ax.set_title( + f"GLOCON score for clustering: {unp} ({unp_range[0]}-{unp_range[1]})", + fontweight="bold", + ) + fname = f"{unp}_{unp_range[0]}_{unp_range[1]}_swarm_plot" else: - logger.info("Single cluster for segment. Not rendering dendrogram.") + fname = f"{unp}_swarm_plot" + + # Save file + + io_utils.save_figure( + path_save, + save_fname=fname, + png=png, + svg=svg, + ) + + plt.close(fig=fig) diff --git a/cluster_conformers/utils/download_utils.py b/cluster_conformers/utils/download_utils.py index 2142148..7399e99 100644 --- a/cluster_conformers/utils/download_utils.py +++ b/cluster_conformers/utils/download_utils.py @@ -33,6 +33,12 @@ def fetch_benchmark_mmcifs(path_benchmark_df: PosixPath, path_save: PosixPath) - pdbe_ids = benchmark_df["PDBe_ID"].unique() # Download and save + logger.info( + f""" + Downloading {len(pdbe_ids)} updated mmCIFs from the PDBe. + This may take a few minutes... + """ + ) for pdbe in pdbe_ids: fetch_updated_mmcif(pdbe, path_save) @@ -53,9 +59,9 @@ def fetch_updated_mmcif(pdb_code: str, path_save: PosixPath) -> None: download_link = url + mmcif_file_name # Download - logger.info("Downloading", mmcif_file_name) - response = get(download_link, allow_redirects=True) save_to = path_save.joinpath(mmcif_file_name) + logger.info(f"Downloading {mmcif_file_name} to {save_to}") + response = get(download_link, allow_redirects=True) open(save_to, "wb").write(response.content) diff --git a/cluster_conformers/utils/io_utils.py b/cluster_conformers/utils/io_utils.py index 461c2c9..2a5faf4 100644 --- a/cluster_conformers/utils/io_utils.py +++ b/cluster_conformers/utils/io_utils.py @@ -182,6 +182,7 @@ def save_figure( :type svg: bool, optional """ default_dpi = 200 + path_save.mkdir(parents=True, exist_ok=True) if png: save_fig_dir = path_save.joinpath(f"{save_fname}.png") diff --git a/examples/run_O34926.sh b/examples/run_O34926.sh index 216dafa..1d5cd1d 100755 --- a/examples/run_O34926.sh +++ b/examples/run_O34926.sh @@ -1,12 +1,19 @@ #!/bin/sh +# Create directories if they don't exist +mkdir -p benchmark_data/examples/O34926/O34926_updated_mmcif +mkdir -p benchmark_data/examples/O34926/O34926_ca_distances/unp_residue_ids +mkdir -p benchmark_data/examples/O34926/O34926_distance_differences +mkdir -p benchmark_data/examples/O34926/O34926_cluster_results +mkdir -p benchmark_data/examples/O34926/O34926_alpha_fold_mmcifs + +# Remove files if they exist from previous runs rm benchmark_data/examples/O34926/O34926_ca_distances/* rm benchmark_data/examples/O34926/O34926_ca_distances/unp_residue_ids/* rm benchmark_data/examples/O34926/O34926_distance_differences/* rm benchmark_data/examples/O34926/O34926_cluster_results/* # mprof run --python - python3 find_conformers.py -u "O34926" \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc7_updated.cif A B \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc5_updated.cif A B \ @@ -16,12 +23,11 @@ python3 find_conformers.py -u "O34926" \ -d benchmark_data/examples/O34926/O34926_distance_differences/ \ -m benchmark_data/examples/O34926/O34926_updated_mmcif/3nc6_updated.cif A B \ -i 3nc6 \ + -o benchmark_data/examples/O34926/O34926_distance_difference_maps/ \ -f \ -g benchmark_data/examples/O34926/O34926_cluster_results/ png svg \ -0 1 \ - -1 405 + -1 405 \ + # -w benchmark_data/examples/O34926/O34926_cluster_results/ png svg \ + # -a benchmark_data/examples/O34926/O34926_alpha_fold_mmcifs \ # -v \ - # -a benchmark_data/examples/O34926/O34926_alpha_fold_mmcifs - -# -o benchmark_data/examples/O34926/O34926_distance_difference_maps/ \ -# -w benchmark_data/examples/O34926/O34926_cluster_results/ png svg \ diff --git a/examples/run_P15291.sh b/examples/run_P15291.sh index 58b1c73..a4df8bb 100755 --- a/examples/run_P15291.sh +++ b/examples/run_P15291.sh @@ -1,5 +1,19 @@ #!/bin/sh +# Create directories if they don't exist +mkdir -p benchmark_data/examples/P15291/P15291_updated_mmcif +mkdir -p benchmark_data/examples/P15291/P15291_ca_distances/unp_residue_ids +mkdir -p benchmark_data/examples/P15291/P15291_distance_differences +mkdir -p benchmark_data/examples/P15291/P15291_cluster_results +mkdir -p benchmark_data/examples/P15291/P15291_alpha_fold_mmcifs + +# Remove files if they exist from previous runs +rm benchmark_data/examples/P15291/P15291_ca_distances/* +rm benchmark_data/examples/P15291/P15291_ca_distances/unp_residue_ids/* +rm benchmark_data/examples/P15291/P15291_distance_differences/* +rm benchmark_data/examples/P15291/P15291_cluster_results/* + +# mprof run --python python3 find_conformers.py -u "P15291" \ -m benchmark_data/examples/P15291/P15291_updated_mmcif/2fyb_updated.cif A \ -m benchmark_data/examples/P15291/P15291_updated_mmcif/2fyd_updated.cif B D \ @@ -10,12 +24,11 @@ python3 find_conformers.py -u "P15291" \ -c benchmark_data/examples/P15291/P15291_ca_distances/ \ -d benchmark_data/examples/P15291/P15291_distance_differences/ \ -s benchmark_data/examples/P15291/P15291_cluster_results/ \ + -o benchmark_data/examples/P15291/P15291_distance_difference_maps/ \ -g benchmark_data/examples/P15291/P15291_cluster_results/ png svg \ + -f \ -0 122 \ - -1 398 + -1 398 \ + # -w benchmark_data/examples/P15291/P15291_cluster_results/ png svg \ # -a benchmark_data/examples/P15291/P15291_alpha_fold_mmcifs \ - - - -# -w benchmark_data/examples/P15291/P15291_cluster_results/ png svg \ - # -o benchmark_data/examples/P15291/P15291_distance_difference_maps/ \ + # -v \ diff --git a/find_conformers.py b/find_conformers.py index a76842e..378359f 100644 --- a/find_conformers.py +++ b/find_conformers.py @@ -102,13 +102,6 @@ def create_parser(input_args=None): required=True, ) - parser.add_argument( - "-s", - "--path_clusters", - help="Path to save clustering results", - type=PosixPath, - ) - parser.add_argument( "-c", "--path_ca", @@ -117,6 +110,13 @@ def create_parser(input_args=None): required=True, ) + parser.add_argument( + "-s", + "--path_clusters", + help="Path to save clustering results", + type=PosixPath, + ) + parser.add_argument( "-d", "--path_dd", @@ -139,6 +139,7 @@ def create_parser(input_args=None): help="Path to save swarm plot of scores", nargs="+", type=str, + default=None, ) parser.add_argument( @@ -263,17 +264,17 @@ def main(): force=True, ) + # Define residue range if parsed in + if args.first_residue_position and args.last_residue_position: + this_unp_range = (args.first_residue_position, args.last_residue_position) + else: + this_unp_range = None + # Parsing in options for saving dendrogram if args.path_dendrogram: path_save, png_bool, svg_bool = extract_image_format(args.path_dendrogram) - # Define residue range if parsed in - if args.first_residue_position and args.last_residue_position: - this_unp_range = (args.first_residue_position, args.last_residue_position) - else: - this_unp_range = None - cluster_monomers.render_dendrogram( unp=args.uniprot, path_results=path_save, @@ -283,6 +284,20 @@ def main(): unp_range=this_unp_range, ) + if args.path_swarm: + + path_save, png_bool, svg_bool = extract_image_format(args.path_swarm) + + # Render and save swarm plot of scores + cluster_monomers.render_swarmplot( + unp=args.uniprot, + path_save=path_save, + path_results=args.path_clusters, + png=png_bool, + svg=svg_bool, + unp_range=this_unp_range, + ) + if __name__ == "__main__":