Skip to content

Commit

Permalink
🎨 format and add soon keyword only parameters to "to_excel" (pandas)
Browse files Browse the repository at this point in the history
  • Loading branch information
enryH committed Nov 27, 2024
1 parent 752389b commit 2c92980
Show file tree
Hide file tree
Showing 4 changed files with 761 additions and 652 deletions.
209 changes: 116 additions & 93 deletions docs/tutorial/explorative_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
},
"outputs": [],
"source": [
"%pip install 'njab[all]' openpyxl"
"%pip install 'njab[all]' openpyxl\n",
"\n",
"import logging"
]
},
{
Expand All @@ -31,34 +33,31 @@
"id": "99dc45b9",
"metadata": {
"tags": [
"hide-input"
"hide-cell"
]
},
"outputs": [],
"source": [
"from functools import partial\n",
"from pathlib import Path\n",
"import logging\n",
"\n",
"from IPython.display import display\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import sklearn\n",
"import seaborn\n",
"import sklearn\n",
"from IPython.display import display\n",
"from lifelines.plotting import add_at_risk_counts\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from njab.plotting.km import compare_km_curves, log_rank_test\n",
"import njab\n",
"import njab.plotting\n",
"from njab.plotting.km import compare_km_curves, log_rank_test\n",
"\n",
"njab.pandas.set_pandas_options()\n",
"njab.plotting.set_font_sizes('x-small')\n",
"pd.options.display.min_rows = 10\n",
"njab.plotting.set_font_sizes(\"x-small\")\n",
"seaborn.set_style(\"whitegrid\")\n",
"plt.rcParams['figure.figsize'] = [4.0, 4.0]"
"plt.rcParams[\"figure.figsize\"] = [4.0, 4.0]"
]
},
{
Expand All @@ -80,18 +79,18 @@
},
"outputs": [],
"source": [
"TARGET = 'event'\n",
"TIME_KM = 'time'\n",
"FOLDER = 'prostate'\n",
"CLINIC = 'https://raw.githubusercontent.com/ErikinBC/SurvSet/main/SurvSet/_datagen/output/prostate.csv'\n",
"val_ids: str = '' # List of comma separated values or filepath\n",
"TARGET = \"event\"\n",
"TIME_KM = \"time\"\n",
"FOLDER = \"prostate\"\n",
"CLINIC = \"https://raw.githubusercontent.com/ErikinBC/SurvSet/main/SurvSet/_datagen/output/prostate.csv\"\n",
"val_ids: str = \"\" # List of comma separated values or filepath\n",
"#\n",
"# list or string of csv, eg. \"var1,var2\"\n",
"clinic_cont = ['age']\n",
"clinic_cont = [\"age\"]\n",
"# list or string of csv, eg. \"var1,var2\"\n",
"clinic_binary = ['male', 'AD']\n",
"clinic_binary = [\"male\", \"AD\"]\n",
"# List of comma separated values or filepath\n",
"da_covar = 'num_age,num_wt'"
"da_covar = \"num_age,num_wt\""
]
},
{
Expand Down Expand Up @@ -145,21 +144,26 @@
},
"outputs": [],
"source": [
"clinic = pd.read_csv(CLINIC, index_col=0).dropna(how='any')\n",
"clinic.columns.name = 'feat_name' # ! check needs to be implemented\n",
"clinic = pd.read_csv(CLINIC, index_col=0).dropna(how=\"any\")\n",
"clinic.columns.name = \"feat_name\" # ! check needs to be implemented\n",
"cols_clinic = njab.pandas.get_colums_accessor(clinic)\n",
"clinic = clinic.astype({var: 'int'\n",
" for var in ['event',\n",
" 'time',\n",
" 'num_age',\n",
" 'num_wt',\n",
" 'num_sbp',\n",
" 'num_dbp',\n",
" 'num_sz',\n",
" 'num_sg',\n",
" 'num_sdate',\n",
" 'fac_stage']}\n",
" )\n",
"clinic = clinic.astype(\n",
" {\n",
" var: \"int\"\n",
" for var in [\n",
" \"event\",\n",
" \"time\",\n",
" \"num_age\",\n",
" \"num_wt\",\n",
" \"num_sbp\",\n",
" \"num_dbp\",\n",
" \"num_sz\",\n",
" \"num_sg\",\n",
" \"num_sdate\",\n",
" \"fac_stage\",\n",
" ]\n",
" }\n",
")\n",
"clinic"
]
},
Expand All @@ -178,7 +182,7 @@
"metadata": {},
"outputs": [],
"source": [
"clinic.describe(include='object')"
"clinic.describe(include=\"object\")"
]
},
{
Expand All @@ -198,8 +202,8 @@
"metadata": {},
"outputs": [],
"source": [
"vars_binary = ['fac_hx', 'fac_bm']\n",
"clinic[vars_binary] = clinic[vars_binary].astype('category')"
"vars_binary = [\"fac_hx\", \"fac_bm\"]\n",
"clinic[vars_binary] = clinic[vars_binary].astype(\"category\")"
]
},
{
Expand Down Expand Up @@ -242,8 +246,16 @@
"outputs": [],
"source": [
"vars_cont = [\n",
" 'num_age', 'num_wt', 'num_sbp', 'num_dbp', 'num_hg', 'num_sz', 'num_sg',\n",
" 'num_ap', 'num_sdate', 'fac_stage'\n",
" \"num_age\",\n",
" \"num_wt\",\n",
" \"num_sbp\",\n",
" \"num_dbp\",\n",
" \"num_hg\",\n",
" \"num_sz\",\n",
" \"num_sg\",\n",
" \"num_ap\",\n",
" \"num_sdate\",\n",
" \"fac_stage\",\n",
"]"
]
},
Expand All @@ -267,7 +279,7 @@
},
"outputs": [],
"source": [
"fname = FOLDER / '1_differential_analysis.xlsx'\n",
"fname = FOLDER / \"1_differential_analysis.xlsx\"\n",
"files_out = {fname.name: fname}\n",
"writer = pd.ExcelWriter(fname)\n",
"print(f\"Output will be written to: {fname}\")"
Expand Down Expand Up @@ -318,10 +330,10 @@
"ana_differential = njab.stats.groups_comparision.diff_analysis(\n",
" clinic[vars_cont],\n",
" happend,\n",
" event_names=(TARGET, 'no event'),\n",
" event_names=(TARGET, \"no event\"),\n",
")\n",
"ana_differential = ana_differential.sort_values(('ttest', 'p-val'))\n",
"ana_differential.to_excel(writer, \"clinic continous\", float_format='%.4f')\n",
"ana_differential = ana_differential.sort_values((\"ttest\", \"p-val\"))\n",
"ana_differential.to_excel(writer, sheet_name=\"clinic continous\", float_format=\"%.4f\")\n",
"ana_differential"
]
},
Expand All @@ -348,19 +360,18 @@
"for var in vars_binary:\n",
" if len(clinic[var].cat.categories) == 2:\n",
" diff_binomial.append(\n",
" njab.stats.groups_comparision.binomtest(clinic[var],\n",
" happend,\n",
" event_names=(TARGET,\n",
" 'no-event')))\n",
" njab.stats.groups_comparision.binomtest(\n",
" clinic[var], happend, event_names=(TARGET, \"no-event\")\n",
" )\n",
" )\n",
" else:\n",
" logging.warning(\n",
" f\"Non-binary variable: {var} with {len(clinic[var].cat.categories)} categories\"\n",
" )\n",
"\n",
"diff_binomial = pd.concat(diff_binomial).sort_values(\n",
" ('binomial test', 'pvalue'))\n",
"diff_binomial.to_excel(writer, 'clinic binary', float_format='%.4f')\n",
"with pd.option_context('display.max_rows', len(diff_binomial)):\n",
"diff_binomial = pd.concat(diff_binomial).sort_values((\"binomial test\", \"pvalue\"))\n",
"diff_binomial.to_excel(writer, sheet_name=\"clinic binary\", float_format=\"%.4f\")\n",
"with pd.option_context(\"display.max_rows\", len(diff_binomial)):\n",
" display(diff_binomial)"
]
},
Expand Down Expand Up @@ -388,7 +399,7 @@
"source": [
"clinic_ancova = [TARGET, *covar]\n",
"clinic_ancova = clinic[clinic_ancova].copy()\n",
"clinic_ancova.describe(include='all')"
"clinic_ancova.describe(include=\"all\")"
]
},
{
Expand All @@ -410,17 +421,15 @@
},
"outputs": [],
"source": [
"clinic_ancova = clinic_ancova.dropna(\n",
")\n",
"categorical_columns = clinic_ancova.columns[clinic_ancova.dtypes == 'category']\n",
"clinic_ancova = clinic_ancova.dropna()\n",
"categorical_columns = clinic_ancova.columns[clinic_ancova.dtypes == \"category\"]\n",
"# list the categorical covariates that are converted to integer codes below\n",
"print(\"Available covariates: \", \", \".join(categorical_columns.to_list()))\n",
"for categorical_column in categorical_columns:\n",
" # only works if no NA and only binary variables!\n",
" clinic_ancova[categorical_column] = clinic_ancova[\n",
" categorical_column].cat.codes\n",
" clinic_ancova[categorical_column] = clinic_ancova[categorical_column].cat.codes\n",
"\n",
"desc_ancova = clinic_ancova.describe()\n",
"desc_ancova.to_excel(writer, \"covars\", float_format='%.4f')\n",
"desc_ancova.to_excel(writer, sheet_name=\"covars\", float_format=\"%.4f\")\n",
"desc_ancova"
]
},
Expand All @@ -443,10 +452,10 @@
},
"outputs": [],
"source": [
"if (desc_ancova.loc['std'] < 0.001).sum():\n",
" non_varying = desc_ancova.loc['std'] < 0.001\n",
"if (desc_ancova.loc[\"std\"] < 0.001).sum():\n",
" non_varying = desc_ancova.loc[\"std\"] < 0.001\n",
" non_varying = non_varying[non_varying].index\n",
" print(\"Non varying columns: \", ', '.join(non_varying))\n",
" print(\"Non varying columns: \", \", \".join(non_varying))\n",
" clinic_ancova = clinic_ancova.drop(non_varying, axis=1)\n",
" for col in non_varying:\n",
" covar.remove(col)"
Expand Down Expand Up @@ -476,12 +485,14 @@
" df_clinic=clinic_ancova,\n",
" target=TARGET,\n",
" covar=covar,\n",
" value_name='')\n",
"ancova = ancova.ancova().sort_values('p-unc')\n",
" value_name=\"\",\n",
")\n",
"ancova = ancova.ancova().sort_values(\"p-unc\")\n",
"ancova = ancova.loc[:, \"p-unc\":]\n",
"ancova.columns = pd.MultiIndex.from_product([['ancova'], ancova.columns],\n",
" names=('test', 'var'))\n",
"ancova.to_excel(writer, \"olink controlled\", float_format='%.4f')\n",
"ancova.columns = pd.MultiIndex.from_product(\n",
" [[\"ancova\"], ancova.columns], names=(\"test\", \"var\")\n",
")\n",
"ancova.to_excel(writer, sheet_name=\"olink controlled\", float_format=\"%.4f\")\n",
"ancova.head(20)"
]
},
Expand Down Expand Up @@ -522,7 +533,11 @@
"cell_type": "code",
"execution_count": null,
"id": "c4c8aff8",
"metadata": {},
"metadata": {
"tags": [
"hide-input"
]
},
"outputs": [],
"source": [
"rejected = ancova.query(\"`('ancova', 'rejected')` == True\")\n",
Expand All @@ -546,15 +561,17 @@
"metadata": {},
"outputs": [],
"source": [
"class_weight = 'balanced'\n",
"class_weight = \"balanced\"\n",
"y_km = clinic[TARGET]\n",
"time_km = clinic[TIME_KM]\n",
"compare_km_curves = partial(compare_km_curves,\n",
" time=time_km,\n",
" y=y_km,\n",
" xlim=(0, 80),\n",
" xlabel='time passed',\n",
" ylabel=f'rate {y_km.name}')\n",
"compare_km_curves = partial(\n",
" compare_km_curves,\n",
" time=time_km,\n",
" y=y_km,\n",
" xlim=(0, 80),\n",
" xlabel=\"time passed\",\n",
" ylabel=f\"rate {y_km.name}\",\n",
")\n",
"log_rank_test = partial(\n",
" log_rank_test,\n",
" time=time_km,\n",
Expand All @@ -575,43 +592,49 @@
"cell_type": "code",
"execution_count": null,
"id": "26a0e4a1",
"metadata": {},
"metadata": {
"tags": [
"hide-input"
]
},
"outputs": [],
"source": [
"for marker, _ in rejected.index[:TOP_N]: # first case done above currently\n",
" fig, ax = plt.subplots()\n",
" class_weight = 'balanced'\n",
" class_weight = \"balanced\"\n",
" # class_weight=None\n",
" model = sklearn.linear_model.LogisticRegression(class_weight=class_weight)\n",
" model = model.fit(X=clinic[marker].to_frame(), y=happend)\n",
" print(\n",
" f\"Intercept {float(model.intercept_):5.3f}, coef.: {float(model.coef_):5.3f}\"\n",
" )\n",
" # offset = np.log(p/(1-p)) # ! could be adapted based on proportion of target (for imbalanced data)\n",
" print(f\"Intercept {float(model.intercept_):5.3f}, coef.: {float(model.coef_):5.3f}\")\n",
" # ! could be adapted based on proportion of target (for imbalanced data):\n",
" # offset = np.log(p/(1-p))\n",
" offset = np.log(0.5 / (1 - 0.5)) # ! standard cutoff of probability of 0.5\n",
" # solve offset = intercept + coef * x for x; parentheses matter once offset != 0\n",
" cutoff = (offset - float(model.intercept_)) / float(model.coef_)\n",
" direction = '>' if model.coef_ > 0 else '<'\n",
" print(\n",
" f\"Custom cutoff defined by Logistic regressor for {marker:>10}: {cutoff:.3f}\"\n",
" )\n",
" direction = \">\" if model.coef_ > 0 else \"<\"\n",
" print(f\"Custom cutoff defined by Logistic regressor for {marker:>10}: {cutoff:.3f}\")\n",
" pred = njab.sklearn.scoring.get_pred(model, clinic[marker].to_frame())\n",
" ax, kmf_0, kmf_1 = compare_km_curves(pred=pred)\n",
" res = log_rank_test(mask=pred)\n",
" ax.set_title(\n",
" f'KM curve for {TARGET.lower()}'\n",
" f' and marker {marker} \\n'\n",
" f'(cutoff{direction}{cutoff:.2f}, log-rank-test p={res.p_value:.3f})')\n",
" ax.legend([\n",
" f\"KP pred=0 (N={(~pred).sum()})\", '95% CI (pred=0)',\n",
" f\"KP pred=1 (N={pred.sum()})\", '95% CI (pred=1)'\n",
" ])\n",
" fname = FOLDER / f'KM_plot_{marker}.pdf'\n",
" f\"KM curve for {TARGET.lower()}\"\n",
" f\" and marker {marker} \\n\"\n",
" f\"(cutoff{direction}{cutoff:.2f}, log-rank-test p={res.p_value:.3f})\"\n",
" )\n",
" ax.legend(\n",
" [\n",
" f\"KP pred=0 (N={(~pred).sum()})\",\n",
" \"95% CI (pred=0)\",\n",
" f\"KP pred=1 (N={pred.sum()})\",\n",
" \"95% CI (pred=1)\",\n",
" ]\n",
" )\n",
" fname = FOLDER / f\"KM_plot_{marker}.pdf\"\n",
" files_out[fname.name] = fname\n",
" njab.plotting.savefig(ax.get_figure(), fname)\n",
"\n",
" # add counts\n",
" add_at_risk_counts(kmf_0, kmf_1, ax=ax)\n",
" fname = FOLDER / f'KM_plot_{marker}_w_counts.pdf'\n",
" fname = FOLDER / f\"KM_plot_{marker}_w_counts.pdf\"\n",
" files_out[fname.name] = fname\n",
" njab.plotting.savefig(ax.get_figure(), fname)"
]
Expand Down
Loading

0 comments on commit 2c92980

Please sign in to comment.