Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🎨 Format and add soon-to-be keyword-only parameters to "to_excel" (pandas) #15

Merged
merged 2 commits into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 117 additions & 92 deletions docs/tutorial/explorative_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
},
"outputs": [],
"source": [
"%pip install 'njab[all]' openpyxl"
"%pip install 'njab[all]' openpyxl\n",
"\n",
"import logging"
]
},
{
Expand All @@ -31,34 +33,31 @@
"id": "99dc45b9",
"metadata": {
"tags": [
"hide-input"
"hide-cell"
]
},
"outputs": [],
"source": [
"from functools import partial\n",
"from pathlib import Path\n",
"import logging\n",
"\n",
"from IPython.display import display\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import sklearn\n",
"import seaborn\n",
"import sklearn\n",
"from IPython.display import display\n",
"from lifelines.plotting import add_at_risk_counts\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from njab.plotting.km import compare_km_curves, log_rank_test\n",
"import njab\n",
"import njab.plotting\n",
"from njab.plotting.km import compare_km_curves, log_rank_test\n",
"\n",
"njab.pandas.set_pandas_options()\n",
"njab.plotting.set_font_sizes('x-small')\n",
"pd.options.display.min_rows = 10\n",
"njab.plotting.set_font_sizes(\"x-small\")\n",
"seaborn.set_style(\"whitegrid\")\n",
"plt.rcParams['figure.figsize'] = [4.0, 4.0]"
"plt.rcParams[\"figure.figsize\"] = [4.0, 4.0]"
]
},
{
Expand All @@ -80,18 +79,18 @@
},
"outputs": [],
"source": [
"TARGET = 'event'\n",
"TIME_KM = 'time'\n",
"FOLDER = 'prostate'\n",
"CLINIC = 'https://raw.githubusercontent.com/ErikinBC/SurvSet/main/SurvSet/_datagen/output/prostate.csv'\n",
"val_ids: str = '' # List of comma separated values or filepath\n",
"TARGET = \"event\"\n",
"TIME_KM = \"time\"\n",
"FOLDER = \"prostate\"\n",
"CLINIC = \"https://raw.githubusercontent.com/ErikinBC/SurvSet/main/SurvSet/_datagen/output/prostate.csv\"\n",
"val_ids: str = \"\" # List of comma separated values or filepath\n",
"#\n",
"# list or string of csv, eg. \"var1,var2\"\n",
"clinic_cont = ['age']\n",
"clinic_cont = [\"age\"]\n",
"# list or string of csv, eg. \"var1,var2\"\n",
"clinic_binary = ['male', 'AD']\n",
"clinic_binary = [\"male\", \"AD\"]\n",
"# List of comma separated values or filepath\n",
"da_covar = 'num_age,num_wt'"
"da_covar = \"num_age,num_wt\""
]
},
{
Expand Down Expand Up @@ -145,21 +144,26 @@
},
"outputs": [],
"source": [
"clinic = pd.read_csv(CLINIC, index_col=0).dropna(how='any')\n",
"clinic.columns.name = 'feat_name' # ! check needs to be implemented\n",
"clinic = pd.read_csv(CLINIC, index_col=0).dropna(how=\"any\")\n",
"clinic.columns.name = \"feat_name\" # ! check needs to be implemented\n",
"cols_clinic = njab.pandas.get_colums_accessor(clinic)\n",
"clinic = clinic.astype({var: 'int'\n",
" for var in ['event',\n",
" 'time',\n",
" 'num_age',\n",
" 'num_wt',\n",
" 'num_sbp',\n",
" 'num_dbp',\n",
" 'num_sz',\n",
" 'num_sg',\n",
" 'num_sdate',\n",
" 'fac_stage']}\n",
" )\n",
"clinic = clinic.astype(\n",
" {\n",
" var: \"int\"\n",
" for var in [\n",
" \"event\",\n",
" \"time\",\n",
" \"num_age\",\n",
" \"num_wt\",\n",
" \"num_sbp\",\n",
" \"num_dbp\",\n",
" \"num_sz\",\n",
" \"num_sg\",\n",
" \"num_sdate\",\n",
" \"fac_stage\",\n",
" ]\n",
" }\n",
")\n",
"clinic"
]
},
Expand All @@ -178,7 +182,7 @@
"metadata": {},
"outputs": [],
"source": [
"clinic.describe(include='object')"
"clinic.describe(include=\"object\")"
]
},
{
Expand All @@ -198,8 +202,8 @@
"metadata": {},
"outputs": [],
"source": [
"vars_binary = ['fac_hx', 'fac_bm']\n",
"clinic[vars_binary] = clinic[vars_binary].astype('category')"
"vars_binary = [\"fac_hx\", \"fac_bm\"]\n",
"clinic[vars_binary] = clinic[vars_binary].astype(\"category\")"
]
},
{
Expand Down Expand Up @@ -242,8 +246,16 @@
"outputs": [],
"source": [
"vars_cont = [\n",
" 'num_age', 'num_wt', 'num_sbp', 'num_dbp', 'num_hg', 'num_sz', 'num_sg',\n",
" 'num_ap', 'num_sdate', 'fac_stage'\n",
" \"num_age\",\n",
" \"num_wt\",\n",
" \"num_sbp\",\n",
" \"num_dbp\",\n",
" \"num_hg\",\n",
" \"num_sz\",\n",
" \"num_sg\",\n",
" \"num_ap\",\n",
" \"num_sdate\",\n",
" \"fac_stage\",\n",
"]"
]
},
Expand All @@ -267,7 +279,7 @@
},
"outputs": [],
"source": [
"fname = FOLDER / '1_differential_analysis.xlsx'\n",
"fname = FOLDER / \"1_differential_analysis.xlsx\"\n",
"files_out = {fname.name: fname}\n",
"writer = pd.ExcelWriter(fname)\n",
"print(f\"Output will be written to: {fname}\")"
Expand Down Expand Up @@ -318,10 +330,10 @@
"ana_differential = njab.stats.groups_comparision.diff_analysis(\n",
" clinic[vars_cont],\n",
" happend,\n",
" event_names=(TARGET, 'no event'),\n",
" event_names=(TARGET, \"no event\"),\n",
")\n",
"ana_differential = ana_differential.sort_values(('ttest', 'p-val'))\n",
"ana_differential.to_excel(writer, \"clinic continous\", float_format='%.4f')\n",
"ana_differential = ana_differential.sort_values((\"ttest\", \"p-val\"))\n",
"ana_differential.to_excel(writer, sheet_name=\"clinic continous\", float_format=\"%.4f\")\n",
"ana_differential"
]
},
Expand All @@ -348,19 +360,18 @@
"for var in vars_binary:\n",
" if len(clinic[var].cat.categories) == 2:\n",
" diff_binomial.append(\n",
" njab.stats.groups_comparision.binomtest(clinic[var],\n",
" happend,\n",
" event_names=(TARGET,\n",
" 'no-event')))\n",
" njab.stats.groups_comparision.binomtest(\n",
" clinic[var], happend, event_names=(TARGET, \"no-event\")\n",
" )\n",
" )\n",
" else:\n",
" logging.warning(\n",
" f\"Non-binary variable: {var} with {len(clinic[var].cat.categories)} categories\"\n",
" )\n",
"\n",
"diff_binomial = pd.concat(diff_binomial).sort_values(\n",
" ('binomial test', 'pvalue'))\n",
"diff_binomial.to_excel(writer, 'clinic binary', float_format='%.4f')\n",
"with pd.option_context('display.max_rows', len(diff_binomial)):\n",
"diff_binomial = pd.concat(diff_binomial).sort_values((\"binomial test\", \"pvalue\"))\n",
"diff_binomial.to_excel(writer, sheet_name=\"clinic binary\", float_format=\"%.4f\")\n",
"with pd.option_context(\"display.max_rows\", len(diff_binomial)):\n",
" display(diff_binomial)"
]
},
Expand Down Expand Up @@ -388,7 +399,7 @@
"source": [
"clinic_ancova = [TARGET, *covar]\n",
"clinic_ancova = clinic[clinic_ancova].copy()\n",
"clinic_ancova.describe(include='all')"
"clinic_ancova.describe(include=\"all\")"
]
},
{
Expand All @@ -410,17 +421,15 @@
},
"outputs": [],
"source": [
"clinic_ancova = clinic_ancova.dropna(\n",
")\n",
"categorical_columns = clinic_ancova.columns[clinic_ancova.dtypes == 'category']\n",
"clinic_ancova = clinic_ancova.dropna()\n",
"categorical_columns = clinic_ancova.columns[clinic_ancova.dtypes == \"category\"]\n",
"print(\"Available covariates: \" \", \".join(categorical_columns.to_list()))\n",
"for categorical_column in categorical_columns:\n",
" # only works if no NA and only binary variables!\n",
" clinic_ancova[categorical_column] = clinic_ancova[\n",
" categorical_column].cat.codes\n",
" clinic_ancova[categorical_column] = clinic_ancova[categorical_column].cat.codes\n",
"\n",
"desc_ancova = clinic_ancova.describe()\n",
"desc_ancova.to_excel(writer, \"covars\", float_format='%.4f')\n",
"desc_ancova.to_excel(writer, sheet_name=\"covars\", float_format=\"%.4f\")\n",
"desc_ancova"
]
},
Expand All @@ -443,10 +452,10 @@
},
"outputs": [],
"source": [
"if (desc_ancova.loc['std'] < 0.001).sum():\n",
" non_varying = desc_ancova.loc['std'] < 0.001\n",
"if (desc_ancova.loc[\"std\"] < 0.001).sum():\n",
" non_varying = desc_ancova.loc[\"std\"] < 0.001\n",
" non_varying = non_varying[non_varying].index\n",
" print(\"Non varying columns: \", ', '.join(non_varying))\n",
" print(\"Non varying columns: \", \", \".join(non_varying))\n",
" clinic_ancova = clinic_ancova.drop(non_varying, axis=1)\n",
" for col in non_varying:\n",
" covar.remove(col)"
Expand Down Expand Up @@ -476,12 +485,14 @@
" df_clinic=clinic_ancova,\n",
" target=TARGET,\n",
" covar=covar,\n",
" value_name='')\n",
"ancova = ancova.ancova().sort_values('p-unc')\n",
" value_name=\"\",\n",
")\n",
"ancova = ancova.ancova().sort_values(\"p-unc\")\n",
"ancova = ancova.loc[:, \"p-unc\":]\n",
"ancova.columns = pd.MultiIndex.from_product([['ancova'], ancova.columns],\n",
" names=('test', 'var'))\n",
"ancova.to_excel(writer, \"olink controlled\", float_format='%.4f')\n",
"ancova.columns = pd.MultiIndex.from_product(\n",
" [[\"ancova\"], ancova.columns], names=(\"test\", \"var\")\n",
")\n",
"ancova.to_excel(writer, sheet_name=\"olink controlled\", float_format=\"%.4f\")\n",
"ancova.head(20)"
]
},
Expand Down Expand Up @@ -522,7 +533,11 @@
"cell_type": "code",
"execution_count": null,
"id": "c4c8aff8",
"metadata": {},
"metadata": {
"tags": [
"hide-input"
]
},
"outputs": [],
"source": [
"rejected = ancova.query(\"`('ancova', 'rejected')` == True\")\n",
Expand All @@ -546,15 +561,17 @@
"metadata": {},
"outputs": [],
"source": [
"class_weight = 'balanced'\n",
"class_weight = \"balanced\"\n",
"y_km = clinic[TARGET]\n",
"time_km = clinic[TIME_KM]\n",
"compare_km_curves = partial(compare_km_curves,\n",
" time=time_km,\n",
" y=y_km,\n",
" xlim=(0, 80),\n",
" xlabel='time passed',\n",
" ylabel=f'rate {y_km.name}')\n",
"compare_km_curves = partial(\n",
" compare_km_curves,\n",
" time=time_km,\n",
" y=y_km,\n",
" xlim=(0, 80),\n",
" xlabel=\"time passed\",\n",
" ylabel=f\"rate {y_km.name}\",\n",
")\n",
"log_rank_test = partial(\n",
" log_rank_test,\n",
" time=time_km,\n",
Expand All @@ -575,43 +592,51 @@
"cell_type": "code",
"execution_count": null,
"id": "26a0e4a1",
"metadata": {},
"metadata": {
"tags": [
"hide-input"
]
},
"outputs": [],
"source": [
"for marker, _ in rejected.index[:TOP_N]: # first case done above currently\n",
" fig, ax = plt.subplots()\n",
" class_weight = 'balanced'\n",
" class_weight = \"balanced\"\n",
" # class_weight=None\n",
" model = sklearn.linear_model.LogisticRegression(class_weight=class_weight)\n",
" model = model.fit(X=clinic[marker].to_frame(), y=happend)\n",
" print(\n",
" f\"Intercept {float(model.intercept_):5.3f}, coef.: {float(model.coef_):5.3f}\"\n",
" f\"Intercept {float(model.intercept_.squeeze()):5.3f}, coef.: {float(model.coef_.squeeze()):5.3f}\"\n",
" )\n",
" # offset = np.log(p/(1-p)) # ! could be adapted based on proportion of target (for imbalanced data)\n",
"    # ! could be adapted based on proportion of target (for imbalanced data):\n",
" # offset = np.log(p/(1-p))\n",
" offset = np.log(0.5 / (1 - 0.5)) # ! standard cutoff of probability of 0.5\n",
" cutoff = offset - float(model.intercept_) / float(model.coef_)\n",
" direction = '>' if model.coef_ > 0 else '<'\n",
" print(\n",
" f\"Custom cutoff defined by Logistic regressor for {marker:>10}: {cutoff:.3f}\"\n",
" )\n",
" cutoff = offset - float(model.intercept_.squeeze()) / float(model.coef_.squeeze())\n",
" direction = \">\" if model.coef_ > 0 else \"<\"\n",
" print(f\"Custom cutoff defined by Logistic regressor for {marker:>10}: {cutoff:.3f}\")\n",
" pred = njab.sklearn.scoring.get_pred(model, clinic[marker].to_frame())\n",
" ax, kmf_0, kmf_1 = compare_km_curves(pred=pred)\n",
" res = log_rank_test(mask=pred)\n",
" ax.set_title(\n",
" f'KM curve for {TARGET.lower()}'\n",
" f' and marker {marker} \\n'\n",
" f'(cutoff{direction}{cutoff:.2f}, log-rank-test p={res.p_value:.3f})')\n",
" ax.legend([\n",
" f\"KP pred=0 (N={(~pred).sum()})\", '95% CI (pred=0)',\n",
" f\"KP pred=1 (N={pred.sum()})\", '95% CI (pred=1)'\n",
" ])\n",
" fname = FOLDER / f'KM_plot_{marker}.pdf'\n",
" f\"KM curve for {TARGET.lower()}\"\n",
" f\" and marker {marker} \\n\"\n",
" f\"(cutoff{direction}{cutoff:.2f}, log-rank-test p={res.p_value:.3f})\"\n",
" )\n",
" ax.legend(\n",
" [\n",
" f\"KP pred=0 (N={(~pred).sum()})\",\n",
" \"95% CI (pred=0)\",\n",
" f\"KP pred=1 (N={pred.sum()})\",\n",
" \"95% CI (pred=1)\",\n",
" ]\n",
" )\n",
" fname = FOLDER / f\"KM_plot_{marker}.pdf\"\n",
" files_out[fname.name] = fname\n",
" njab.plotting.savefig(ax.get_figure(), fname)\n",
"\n",
" # add counts\n",
" add_at_risk_counts(kmf_0, kmf_1, ax=ax)\n",
" fname = FOLDER / f'KM_plot_{marker}_w_counts.pdf'\n",
" fname = FOLDER / f\"KM_plot_{marker}_w_counts.pdf\"\n",
" files_out[fname.name] = fname\n",
" njab.plotting.savefig(ax.get_figure(), fname)"
]
Expand Down
Loading