Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PDBE-7222: Fixes to dir creation and documentation refresh #9

Merged
merged 10 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
strategy:
max-parallel: 4
matrix:
python-version: [3.10.14]
python-version: [3.10.15]

steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -57,7 +57,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.10.14]
python-version: [3.10.15]
needs: build

steps:
Expand Down
10 changes: 5 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ celerybeat-schedule
.venv
env/
venv/
clustering_venv/
cluster_venv/
ENV/
env.bak/
venv.bak/
Expand Down Expand Up @@ -154,26 +154,26 @@ benchmark_data/examples/P15291/P15291_distance_differences/*
benchmark_data/examples/P15291/P15291_distance_difference_maps/*
benchmark_data/examples/P15291/P15291_ca_distances/*
benchmark_data/examples/P15291/P15291_cluster_results/*
benchmark_data/examples/P15291/P15291_alpha_fold_mmcifs/*
benchmark_data/examples/P15291/P15291_alphafold/*

!benchmark_data/examples/P15291/P15291_distance_differences/.gitkeep
!benchmark_data/examples/P15291/P15291_distance_difference_maps/.gitkeep
!benchmark_data/examples/P15291/P15291_ca_distances/.gitkeep
!benchmark_data/examples/P15291/P15291_cluster_results/.gitkeep
!benchmark_data/examples/P15291/P15291_alpha_fold_mmcifs/.gitkeep
!benchmark_data/examples/P15291/P15291_alphafold/.gitkeep

# Example outputs -- O34926
benchmark_data/examples/O34926/O34926_distance_differences/*
benchmark_data/examples/O34926/O34926_distance_difference_maps/*
benchmark_data/examples/O34926/O34926_ca_distances/*
benchmark_data/examples/O34926/O34926_cluster_results/*
benchmark_data/examples/O34926/O34926_alpha_fold_mmcifs/*
benchmark_data/examples/O34926/O34926_path_alphafold/*

!benchmark_data/examples/O34926/O34926_distance_differences/.gitkeep
!benchmark_data/examples/O34926/O34926_distance_difference_maps/.gitkeep
!benchmark_data/examples/O34926/O34926_ca_distances/.gitkeep
!benchmark_data/examples/O34926/O34926_cluster_results/.gitkeep
!benchmark_data/examples/O34926/O34926_alpha_fold_mmcifs/.gitkeep
!benchmark_data/examples/O34926/O34926_path_alphafold/.gitkeep

# Outputs from unit tests
tests/test_output/*
Expand Down
129 changes: 85 additions & 44 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion __version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.2.0"
__version__ = "1.2.1"
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
12 changes: 6 additions & 6 deletions cluster_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,12 @@ def benchmark_cluster(benchmark_df, unp):
png_bool = True
svg_bool = True

unp_cluster.make_dendrogram(
PATH_SAVE_CLUSTER_RESULTS.joinpath("dendrograms"), png=png_bool, svg=svg_bool
)

unp_cluster.make_swarmplot(
PATH_SAVE_CLUSTER_RESULTS.joinpath("swarm_plots"), png=png_bool, svg=svg_bool
cluster_monomers.render_dendrogram(
unp=unp,
path_results=PATH_SAVE_CLUSTER_RESULTS,
path_save=PATH_SAVE_CLUSTER_RESULTS.joinpath("dendrograms"),
png=png_bool,
svg=svg_bool,
)

"""
Expand Down
38 changes: 12 additions & 26 deletions cluster_conformers/cluster_chains.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,16 @@
distance difference maps.
"""

# Standard package imports
from typing import Iterable

import seaborn as sns

# Third-party modules
from matplotlib import pyplot as plt
import seaborn as sns
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from numpy import column_stack, ndarray, zeros
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering

from cluster_conformers.utils.linear_algebra_utils import upper_triangle


def make_linkage_matx(model: AgglomerativeClustering) -> "ndarray[any, float]":
"""
Expand Down Expand Up @@ -103,7 +100,7 @@ def cluster_agglomerative(

def plot_dendrogram(
unp: str, axis, linkage_matrix: ndarray = None, cutoff: float = None, **kwargs
) -> "tuple(Figure, Axes)":
) -> "tuple[Figure, Axes]":
"""
Create linkage matrix from SKLearn model and plot the dendrogram of nodes. Applies
this to parsed Matplotlib axis, inplace.
Expand Down Expand Up @@ -167,31 +164,20 @@ def plot_dendrogram(
del dendrogram_plot


def plot_swarmplot(y_data: Iterable, unp: str) -> "tuple(Figure, Axes)":
"""Creates a strip plot of non-overlapping data points for a given list of data. The
def plot_swarmplot(scores: ndarray, axis) -> "tuple[Figure, Axes]":
"""Creates a swarm plot of non-overlapping data points for a given list of data. The
values of the data will correspond to their y-values. Their position along the
x-axis is irrelevant as they're all identical.

:param y_data: Array of data points to plot.
:type y_data: Iterable
:param unp: UniProt accession.
:type unp: str
:param scores: Matrix of GLOCON scores.
:type scores: Iterable
:param axis: Matplotlib axis to plot the swarm plot on.
:type axis: Axes
:return: Figure and axis objects containing the plotted swarm plot
:rtype: tuple(matplotlib.figure.Figure, matplotlib.axes.Axes)
"""
# Init the figure
_, ax = plt.subplots(
1,
1,
figsize=(4, 5),
# ncols=1, # INTRA | bar | INTER | bar
# nrows=1,
# gridspec_kw=dict(width_ratios=[4, 0.2, 4, 0.2]),
tight_layout=True,
)
# Plot the data
sns.swarmplot(data=y_data, ax=ax, size=5) # vmax=max_dist ,
sns.swarmplot(data=upper_triangle(scores), ax=axis, size=5) # vmax=max_dist ,

# Add some formatting
ax.set_title(unp, fontweight="bold")
ax.set_ylabel("Score (\u212B)")
axis.set_ylabel("GLOCON score (\u212B)")
149 changes: 113 additions & 36 deletions cluster_conformers/cluster_monomers.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ def __init__(
# Handle pre-clustering AlphaFold file information
if path_save_alphafold:

path_save_alphafold.mkdir(exist_ok=True, parents=True)

# Download and save
afdb_path = download_utils.download_alphafold_mmcif(
self.unp, path_save_alphafold
Expand All @@ -149,13 +151,15 @@ def __init__(
af_chain = "A"
self.chains[af_prefix] = [af_chain]
self.chains_all.append(af_chain)
self.pdbe_chain_ids.append(f"{af_prefix}_{af_chain}")

# Parse AlphaFold structure for extracting chain info for superpose.py
afdb_mmcif = parsing_utils.parse_mmcif(afdb_structure, af_chain)

# Storing the start-end UniProt residue indices for protein-superpose
self.af_unp_range = (
afdb_mmcif["unp_res_ids"][0],
afdb_mmcif["unp_res_ids"][-1],
min(afdb_mmcif["unp_res_ids"]),
max(afdb_mmcif["unp_res_ids"]),
)

# Number of threads for multiprocessing. Only use 1 if few unique chains
Expand Down Expand Up @@ -202,7 +206,7 @@ def remove_entry_matxs(
logger.debug(f"Removing {file}")
file.unlink()

def _generate_ca_matx(self, pdbe_chain_id: str) -> "tuple(dict)":
def _generate_ca_matx(self, pdbe_chain_id: str) -> "tuple[dict]":
"""
Method for calculating and saving the CA distance matrix for a given PDB-chain
ID string.
Expand Down Expand Up @@ -278,7 +282,7 @@ def ca_distance(self, path_save: PosixPath = None) -> None:

# Dir to save the raw UniProt residue IDs as 1D np.array()s
self.path_save_unps = path_save.joinpath("unp_residue_ids")
self.path_save_unps.mkdir(exist_ok=True)
self.path_save_unps.mkdir(exist_ok=True, parents=True)

self.path_save_base_ca = path_save

Expand Down Expand Up @@ -463,6 +467,9 @@ def cluster(
:type path_save_cluster_results: PosixPath, optional
"""

if path_save_dd_matx:
path_save_dd_matx.mkdir(exist_ok=True, parents=True)

logger.info("Generating distance difference matrices...")
self.score_matx, self.label_matx = self.build_clustering_inputs(
path_save_dd_matx
Expand Down Expand Up @@ -517,6 +524,7 @@ def cluster(

# Write out clustering results if path specified
if path_save_cluster_results:
path_save_cluster_results.mkdir(exist_ok=True, parents=True)

# Save clustering results
path_save_all_conf = path_save_cluster_results.joinpath(
Expand Down Expand Up @@ -706,39 +714,108 @@ def render_dendrogram(
)
return

if not np.array_equal(linkage_matx, np.array([0])):

fig, ax = plt.subplots(1, 1)

logger.info("Rendering dendogram")
cluster_chains.plot_dendrogram(
unp,
ax,
linkage_matx,
CLUSTERING_CUTOFF_PC,
labels=pdbe_chain_ids,
leaf_rotation=90,
) # p=3

# UniProt residue range specified, make modifications (optional)
if unp_range:
ax.set_title(
f"Agglomerative clustering results: {unp} ({unp_range[0]}-{unp_range[1]})",
fontweight="bold",
)
fname = f"{unp}_{unp_range[0]}_{unp_range[1]}_agglomerative_dendrogram"
else:
fname = f"{unp}_agglomerative_dendrogram"

# Save file
io_utils.save_figure(
path_save,
save_fname=fname,
png=png,
svg=svg,
if np.array_equal(linkage_matx, np.array([0])):
logger.info("Single cluster for segment. Not rendering dendrogram.")
return

fig, ax = plt.subplots(1, 1)

logger.info("Rendering dendogram")
cluster_chains.plot_dendrogram(
unp,
ax,
linkage_matx,
CLUSTERING_CUTOFF_PC,
labels=pdbe_chain_ids,
leaf_rotation=90,
) # p=3

# UniProt residue range specified, make modifications (optional)
if unp_range:
ax.set_title(
f"Agglomerative clustering results: {unp} ({unp_range[0]}-{unp_range[1]})",
fontweight="bold",
)
fname = f"{unp}_{unp_range[0]}_{unp_range[1]}_agglomerative_dendrogram"
else:
fname = f"{unp}_agglomerative_dendrogram"

# Save file
io_utils.save_figure(
path_save,
save_fname=fname,
png=png,
svg=svg,
)

# plt.close(fig=fig)

plt.close(fig=fig)

def render_swarmplot(
    unp: str,
    path_results: PosixPath,
    path_save: PosixPath = None,
    png: bool = False,
    svg: bool = False,
    unp_range: "tuple[int, int]" = None,
) -> None:
    """
    Plot a swarm plot of the scores held in a previously saved score matrix. The matrix
    is read from ``<path_results>/<unp>_score_matrix.npz``, so clustering must already
    have been run and its results written to ``path_results``.

    Nothing is rendered (and the function returns early) when the score matrix file
    cannot be read, or when the loaded matrix equals the placeholder ``[0]``.

    :param unp: UniProt accession. Used to locate the score matrix file and to name the
        saved figure.
    :type unp: str
    :param path_results: Directory containing the saved ``{unp}_score_matrix.npz``.
    :type path_results: PosixPath
    :param path_save: Directory to save the rendered swarm plot image in. It is passed
        straight to ``io_utils.save_figure``, so a real path should be supplied even
        though the default is None.
    :type path_save: PosixPath
    :param png: Save swarm plot image in PNG format, defaults to False
    :type png: bool, optional
    :param svg: Save swarm plot image in SVG format, defaults to False
    :type svg: bool, optional
    :param unp_range: Range of UniProt residues used for clustering. When given, it is
        added to the plot title and to the output file name.
    :type unp_range: tuple[int, int], optional
    """

    # Set matplotlib global formatting
    appearance_utils.init_plot_appearance()

    try:
        score_matx = io_utils.load_matrix(
            path_results.joinpath(f"{unp}_score_matrix.npz")
        )
    except OSError:
        # Only the score matrix is needed here, so report exactly that
        logger.error("Score matrix not found. Please run clustering first.")
        return

    if np.array_equal(score_matx, np.array([0])):
        logger.info("Single cluster for segment. Not rendering swarm plot.")
        return

    fig, ax = plt.subplots(1, 1)

    # Guarantee the figure is released even if plotting or saving fails
    try:
        logger.info("Rendering swarm plot")
        cluster_chains.plot_swarmplot(
            score_matx,
            ax,
        )

        # UniProt residue range specified, make modifications (optional)
        if unp_range:
            ax.set_title(
                f"GLOCON score for clustering: {unp} ({unp_range[0]}-{unp_range[1]})",
                fontweight="bold",
            )
            fname = f"{unp}_{unp_range[0]}_{unp_range[1]}_swarm_plot"
        else:
            fname = f"{unp}_swarm_plot"

        # Save file
        io_utils.save_figure(
            path_save,
            save_fname=fname,
            png=png,
            svg=svg,
        )
    finally:
        plt.close(fig=fig)
10 changes: 8 additions & 2 deletions cluster_conformers/utils/download_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ def fetch_benchmark_mmcifs(path_benchmark_df: PosixPath, path_save: PosixPath) -
pdbe_ids = benchmark_df["PDBe_ID"].unique()

# Download and save
logger.info(
f"""
Downloading {len(pdbe_ids)} updated mmCIFs from the PDBe.
This may take a few minutes...
"""
)
for pdbe in pdbe_ids:
fetch_updated_mmcif(pdbe, path_save)

Expand All @@ -53,9 +59,9 @@ def fetch_updated_mmcif(pdb_code: str, path_save: PosixPath) -> None:
download_link = url + mmcif_file_name

# Download
logger.info("Downloading", mmcif_file_name)
response = get(download_link, allow_redirects=True)
save_to = path_save.joinpath(mmcif_file_name)
logger.info(f"Downloading {mmcif_file_name} to {save_to}")
response = get(download_link, allow_redirects=True)
open(save_to, "wb").write(response.content)


Expand Down
1 change: 1 addition & 0 deletions cluster_conformers/utils/io_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ def save_figure(
:type svg: bool, optional
"""
default_dpi = 200
path_save.mkdir(parents=True, exist_ok=True)

if png:
save_fig_dir = path_save.joinpath(f"{save_fname}.png")
Expand Down
Loading
Loading