Skip to content

Commit

Permalink
🎨 increase fonts, improve plotting
Browse files Browse the repository at this point in the history
- fixed the heatmap color scale to the range -1 to 1 for correlations
- shrink the heatmap colorbar (legend)

:bug: use greater than or equal (>=), not strictly greater (>), for the cutoff
  • Loading branch information
Henry committed Oct 17, 2023
1 parent 288d78e commit 27ba67d
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 13 deletions.
44 changes: 32 additions & 12 deletions project/00_5_training_data_exploration.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
# %%
from __future__ import annotations
import json
import logging
from pathlib import Path

import numpy as np
Expand All @@ -42,8 +43,9 @@
from vaep.analyzers import analyzers

logger = vaep.logging.setup_nb_logger()
logging.getLogger('fontTools').setLevel(logging.WARNING)

matplotlib.rcParams.update({'font.size': 5,
matplotlib.rcParams.update({'font.size': 6,
'figure.figsize': [4.0, 2.0]})


Expand Down Expand Up @@ -189,7 +191,7 @@ def get_dynamic_range(min_max):
# %%
min_samples_per_feat = int(len(data) * COMPLETENESS_OVER_SAMPLES)
print(f"{min_samples_per_feat = }")
mask = data.notna().sum(axis=0) > min_samples_per_feat
mask = data.notna().sum(axis=0) >= min_samples_per_feat
print(f"drop = {(~mask).sum()} features")
selected = data.loc[:, mask]
selected.shape
Expand Down Expand Up @@ -305,7 +307,7 @@ def get_dynamic_range(min_max):
# %%time
corr_lower_triangle = analyzers.corr_lower_triangle(data)
fig, axes = analyzers.plot_corr_histogram(corr_lower_triangle, bins=40)
fname = FIGUREFOLDER / f'corr_histogram_feat.pdf'
fname = FIGUREFOLDER / 'corr_histogram_feat.pdf'
files_out[fname.name] = fname
vaep.savefig(fig, name=fname)

Expand All @@ -317,7 +319,7 @@ def get_dynamic_range(min_max):
cv = data.std() / data.mean()
# biological coefficient of variation: standard deviation (variation) relative to the mean
ax = cv.hist(bins=30)
fname = FIGUREFOLDER / f'CV_histogram_features.pdf'
fname = FIGUREFOLDER / 'CV_histogram_features.pdf'
files_out[fname.name] = fname
vaep.savefig(ax.get_figure(), name=fname)

Expand All @@ -328,7 +330,10 @@ def get_dynamic_range(min_max):
# needs to deal with duplicates
# notna = data.notna().T.drop_duplicates().T
# get index and column names
cg = sns.clustermap(data.notna(), cbar_pos=None)
vaep.plotting.make_large_descriptors(8)
cg = sns.clustermap(data.notna(),
cbar_pos=None,
figsize=(8, 8))
ax = cg.ax_heatmap
if PG_SEPARATOR is not None:
_new_labels = [l.get_text().split(PG_SEPARATOR)[0]
Expand All @@ -341,7 +346,8 @@ def get_dynamic_range(min_max):
files_out[fname.name] = fname
vaep.savefig(cg.fig,
name=fname,
pdf=False)
pdf=False,
dpi=600)

# %% [markdown]
# Based on the clustering, plot heatmaps of features and samples
Expand All @@ -351,10 +357,12 @@ def get_dynamic_range(min_max):
cg.dendrogram_col.reordered_ind)) == data.shape

# %%
vaep.plotting.make_large_descriptors(5)
vaep.plotting.make_large_descriptors(8)
fig, ax = plt.subplots(figsize=(4, 4))
ax = sns.heatmap(
data.iloc[cg.dendrogram_row.reordered_ind,
cg.dendrogram_col.reordered_ind],
ax=ax,
)
only_every_x_ticks(ax, x=2)
use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS)
Expand All @@ -367,7 +375,7 @@ def get_dynamic_range(min_max):
ax.set_yticks([])
fname = FIGUREFOLDER / 'heatmap_intensities_ordered_by_missing_pattern.png'
files_out[fname.name] = fname
vaep.savefig(ax.get_figure(), name=fname, pdf=False)
vaep.savefig(fig, name=fname, pdf=False, dpi=600)
# ax.get_figure().savefig(fname, dpi=300)

# %% [markdown]
Expand All @@ -378,6 +386,9 @@ def get_dynamic_range(min_max):
ax = sns.heatmap(
analyzers.corr_lower_triangle(
data.iloc[:, cg.dendrogram_col.reordered_ind]),
vmin=-1,
vmax=1,
cbar_kws={'shrink': 0.75},
ax=ax,
square=True,
)
Expand All @@ -392,14 +403,19 @@ def get_dynamic_range(min_max):
ax.set_yticks([])
fname = FIGUREFOLDER / 'heatmap_feature_correlation.png'
files_out[fname.name] = fname
vaep.savefig(fig, name=fname, pdf=False)
vaep.savefig(fig, name=fname, pdf=False, dpi=600)

# %%
lower_corr = analyzers.corr_lower_triangle(
data.T.iloc[:, cg.dendrogram_row.reordered_ind])
# %%
fig, ax = plt.subplots(figsize=(4, 4))
ax = sns.heatmap(
analyzers.corr_lower_triangle(
data.T.iloc[:, cg.dendrogram_row.reordered_ind]),
data=lower_corr,
ax=ax,
vmin=-1,
vmax=1,
cbar_kws={'shrink': 0.75},
square=True,
)
_ = only_every_x_ticks(ax, x=2)
Expand All @@ -409,9 +425,10 @@ def get_dynamic_range(min_max):
ax.set_yticks([])
fname = FIGUREFOLDER / 'heatmap_sample_correlation.png'
files_out[fname.name] = fname
vaep.savefig(fig, name=fname, pdf=False)
vaep.savefig(fig, name=fname, pdf=False, dpi=600)

# %%
vaep.plotting.make_large_descriptors(12)
kwargs = dict()
if NO_TICK_LABELS_ON_HEATMAP:
kwargs['xticklabels'] = False
Expand Down Expand Up @@ -446,6 +463,7 @@ def get_dynamic_range(min_max):
sample_stats

# %%
vaep.plotting.make_large_descriptors(8)
fig_ident = sns.relplot(
x='SampleID_int', y=COL_NO_IDENTIFIED, data=sample_stats)
fig_ident.set_axis_labels('Sample ID', f'Frequency of identified {TYPE}')
Expand Down Expand Up @@ -491,3 +509,5 @@ def get_dynamic_range(min_max):

# %%
files_out

# %%
2 changes: 1 addition & 1 deletion vaep/analyzers/analyzers.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ def corr_lower_triangle(df):

def plot_corr_histogram(corr_lower_triangle, bins=10):
fig, axes = plt.subplots(ncols=2, gridspec_kw={"width_ratios": [
5, 1], "wspace": 0.2}, figsize=(10, 4))
5, 1], "wspace": 0.2}, figsize=(8, 4))
values = pd.Series(corr_lower_triangle.to_numpy().flatten()).dropna()
ax = axes[0]
ax = values.hist(ax=ax, bins=bins)
Expand Down

0 comments on commit 27ba67d

Please sign in to comment.