diff --git a/docs/tutorial/explorative_analysis.ipynb b/docs/tutorial/explorative_analysis.ipynb index d11ea64..f77f53b 100644 --- a/docs/tutorial/explorative_analysis.ipynb +++ b/docs/tutorial/explorative_analysis.ipynb @@ -22,7 +22,9 @@ }, "outputs": [], "source": [ - "%pip install 'njab[all]' openpyxl" + "%pip install 'njab[all]' openpyxl\n", + "\n", + "import logging" ] }, { @@ -31,34 +33,31 @@ "id": "99dc45b9", "metadata": { "tags": [ - "hide-input" + "hide-cell" ] }, "outputs": [], "source": [ "from functools import partial\n", "from pathlib import Path\n", - "import logging\n", - "\n", - "from IPython.display import display\n", "\n", + "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", - "\n", - "import sklearn\n", "import seaborn\n", + "import sklearn\n", + "from IPython.display import display\n", "from lifelines.plotting import add_at_risk_counts\n", "\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from njab.plotting.km import compare_km_curves, log_rank_test\n", "import njab\n", "import njab.plotting\n", + "from njab.plotting.km import compare_km_curves, log_rank_test\n", "\n", "njab.pandas.set_pandas_options()\n", - "njab.plotting.set_font_sizes('x-small')\n", + "pd.options.display.min_rows = 10\n", + "njab.plotting.set_font_sizes(\"x-small\")\n", "seaborn.set_style(\"whitegrid\")\n", - "plt.rcParams['figure.figsize'] = [4.0, 4.0]" + "plt.rcParams[\"figure.figsize\"] = [4.0, 4.0]" ] }, { @@ -80,18 +79,18 @@ }, "outputs": [], "source": [ - "TARGET = 'event'\n", - "TIME_KM = 'time'\n", - "FOLDER = 'prostate'\n", - "CLINIC = 'https://raw.githubusercontent.com/ErikinBC/SurvSet/main/SurvSet/_datagen/output/prostate.csv'\n", - "val_ids: str = '' # List of comma separated values or filepath\n", + "TARGET = \"event\"\n", + "TIME_KM = \"time\"\n", + "FOLDER = \"prostate\"\n", + "CLINIC = \"https://raw.githubusercontent.com/ErikinBC/SurvSet/main/SurvSet/_datagen/output/prostate.csv\"\n", + "val_ids: str = \"\" # List of comma separated values or filepath\n", "#\n", "# list or string of csv, eg. \"var1,var2\"\n", - "clinic_cont = ['age']\n", + "clinic_cont = [\"age\"]\n", "# list or string of csv, eg. \"var1,var2\"\n", - "clinic_binary = ['male', 'AD']\n", + "clinic_binary = [\"male\", \"AD\"]\n", "# List of comma separated values or filepath\n", - "da_covar = 'num_age,num_wt'" + "da_covar = \"num_age,num_wt\"" ] }, { @@ -145,21 +144,26 @@ }, "outputs": [], "source": [ - "clinic = pd.read_csv(CLINIC, index_col=0).dropna(how='any')\n", - "clinic.columns.name = 'feat_name' # ! check needs to be implemented\n", + "clinic = pd.read_csv(CLINIC, index_col=0).dropna(how=\"any\")\n", + "clinic.columns.name = \"feat_name\" # ! check needs to be implemented\n", "cols_clinic = njab.pandas.get_colums_accessor(clinic)\n", - "clinic = clinic.astype({var: 'int'\n", - " for var in ['event',\n", - " 'time',\n", - " 'num_age',\n", - " 'num_wt',\n", - " 'num_sbp',\n", - " 'num_dbp',\n", - " 'num_sz',\n", - " 'num_sg',\n", - " 'num_sdate',\n", - " 'fac_stage']}\n", - " )\n", + "clinic = clinic.astype(\n", + " {\n", + " var: \"int\"\n", + " for var in [\n", + " \"event\",\n", + " \"time\",\n", + " \"num_age\",\n", + " \"num_wt\",\n", + " \"num_sbp\",\n", + " \"num_dbp\",\n", + " \"num_sz\",\n", + " \"num_sg\",\n", + " \"num_sdate\",\n", + " \"fac_stage\",\n", + " ]\n", + " }\n", + ")\n", "clinic" ] }, @@ -178,7 +182,7 @@ "metadata": {}, "outputs": [], "source": [ - "clinic.describe(include='object')" + "clinic.describe(include=\"object\")" ] }, { @@ -198,8 +202,8 @@ "metadata": {}, "outputs": [], "source": [ - "vars_binary = ['fac_hx', 'fac_bm']\n", - "clinic[vars_binary] = clinic[vars_binary].astype('category')" + "vars_binary = [\"fac_hx\", \"fac_bm\"]\n", + "clinic[vars_binary] = clinic[vars_binary].astype(\"category\")" ] }, { @@ -242,8 +246,16 @@ "outputs": [], "source": [ "vars_cont = [\n", - " 'num_age', 'num_wt', 'num_sbp', 'num_dbp', 'num_hg', 'num_sz', 'num_sg',\n", - " 'num_ap', 'num_sdate', 'fac_stage'\n", + " \"num_age\",\n", + " \"num_wt\",\n", + " \"num_sbp\",\n", + " \"num_dbp\",\n", + " \"num_hg\",\n", + " \"num_sz\",\n", + " \"num_sg\",\n", + " \"num_ap\",\n", + " \"num_sdate\",\n", + " \"fac_stage\",\n", "]" ] }, @@ -267,7 +279,7 @@ }, "outputs": [], "source": [ - "fname = FOLDER / '1_differential_analysis.xlsx'\n", + "fname = FOLDER / \"1_differential_analysis.xlsx\"\n", "files_out = {fname.name: fname}\n", "writer = pd.ExcelWriter(fname)\n", "print(f\"Output will be written to: {fname}\")" @@ -318,10 +330,10 @@ "ana_differential = njab.stats.groups_comparision.diff_analysis(\n", " clinic[vars_cont],\n", " happend,\n", - " event_names=(TARGET, 'no event'),\n", + " event_names=(TARGET, \"no event\"),\n", ")\n", - "ana_differential = ana_differential.sort_values(('ttest', 'p-val'))\n", - "ana_differential.to_excel(writer, \"clinic continous\", float_format='%.4f')\n", + "ana_differential = ana_differential.sort_values((\"ttest\", \"p-val\"))\n", + "ana_differential.to_excel(writer, sheet_name=\"clinic continous\", float_format=\"%.4f\")\n", "ana_differential" ] }, @@ -348,19 +360,18 @@ "for var in vars_binary:\n", " if len(clinic[var].cat.categories) == 2:\n", " diff_binomial.append(\n", - " njab.stats.groups_comparision.binomtest(clinic[var],\n", - " happend,\n", - " event_names=(TARGET,\n", - " 'no-event')))\n", + " njab.stats.groups_comparision.binomtest(\n", + " clinic[var], happend, event_names=(TARGET, \"no-event\")\n", + " )\n", + " )\n", " else:\n", " logging.warning(\n", " f\"Non-binary variable: {var} with {len(clinic[var].cat.categories)} categories\"\n", " )\n", "\n", - "diff_binomial = pd.concat(diff_binomial).sort_values(\n", - " ('binomial test', 'pvalue'))\n", - "diff_binomial.to_excel(writer, 'clinic binary', float_format='%.4f')\n", - "with pd.option_context('display.max_rows', len(diff_binomial)):\n", + "diff_binomial = pd.concat(diff_binomial).sort_values((\"binomial test\", \"pvalue\"))\n", + "diff_binomial.to_excel(writer, sheet_name=\"clinic binary\", float_format=\"%.4f\")\n", + "with pd.option_context(\"display.max_rows\", len(diff_binomial)):\n", " display(diff_binomial)" ] }, @@ -388,7 +399,7 @@ "source": [ "clinic_ancova = [TARGET, *covar]\n", "clinic_ancova = clinic[clinic_ancova].copy()\n", - "clinic_ancova.describe(include='all')" + "clinic_ancova.describe(include=\"all\")" ] }, { @@ -410,17 +421,15 @@ }, "outputs": [], "source": [ - "clinic_ancova = clinic_ancova.dropna(\n", - ")\n", - "categorical_columns = clinic_ancova.columns[clinic_ancova.dtypes == 'category']\n", + "clinic_ancova = clinic_ancova.dropna()\n", + "categorical_columns = clinic_ancova.columns[clinic_ancova.dtypes == \"category\"]\n", "print(\"Available covariates: \" \", \".join(categorical_columns.to_list()))\n", "for categorical_column in categorical_columns:\n", " # only works if no NA and only binary variables!\n", - " clinic_ancova[categorical_column] = clinic_ancova[\n", - " categorical_column].cat.codes\n", + " clinic_ancova[categorical_column] = clinic_ancova[categorical_column].cat.codes\n", "\n", "desc_ancova = clinic_ancova.describe()\n", - "desc_ancova.to_excel(writer, \"covars\", float_format='%.4f')\n", + "desc_ancova.to_excel(writer, sheet_name=\"covars\", float_format=\"%.4f\")\n", "desc_ancova" ] }, @@ -443,10 +452,10 @@ }, "outputs": [], "source": [ - "if (desc_ancova.loc['std'] < 0.001).sum():\n", - " non_varying = desc_ancova.loc['std'] < 0.001\n", + "if (desc_ancova.loc[\"std\"] < 0.001).sum():\n", + " non_varying = desc_ancova.loc[\"std\"] < 0.001\n", " non_varying = non_varying[non_varying].index\n", - " print(\"Non varying columns: \", ', '.join(non_varying))\n", + " print(\"Non varying columns: \", \", \".join(non_varying))\n", " clinic_ancova = clinic_ancova.drop(non_varying, axis=1)\n", " for col in non_varying:\n", " covar.remove(col)" @@ -476,12 +485,14 @@ " df_clinic=clinic_ancova,\n", " target=TARGET,\n", " covar=covar,\n", - " value_name='')\n", - "ancova = ancova.ancova().sort_values('p-unc')\n", + " value_name=\"\",\n", + ")\n", + "ancova = ancova.ancova().sort_values(\"p-unc\")\n", "ancova = ancova.loc[:, \"p-unc\":]\n", - "ancova.columns = pd.MultiIndex.from_product([['ancova'], ancova.columns],\n", - " names=('test', 'var'))\n", - "ancova.to_excel(writer, \"olink controlled\", float_format='%.4f')\n", + "ancova.columns = pd.MultiIndex.from_product(\n", + " [[\"ancova\"], ancova.columns], names=(\"test\", \"var\")\n", + ")\n", + "ancova.to_excel(writer, sheet_name=\"olink controlled\", float_format=\"%.4f\")\n", "ancova.head(20)" ] }, @@ -522,7 +533,11 @@ "cell_type": "code", "execution_count": null, "id": "c4c8aff8", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "rejected = ancova.query(\"`('ancova', 'rejected')` == True\")\n", @@ -546,15 +561,17 @@ "metadata": {}, "outputs": [], "source": [ - "class_weight = 'balanced'\n", + "class_weight = \"balanced\"\n", "y_km = clinic[TARGET]\n", "time_km = clinic[TIME_KM]\n", - "compare_km_curves = partial(compare_km_curves,\n", - " time=time_km,\n", - " y=y_km,\n", - " xlim=(0, 80),\n", - " xlabel='time passed',\n", - " ylabel=f'rate {y_km.name}')\n", + "compare_km_curves = partial(\n", + " compare_km_curves,\n", + " time=time_km,\n", + " y=y_km,\n", + " xlim=(0, 80),\n", + " xlabel=\"time passed\",\n", + " ylabel=f\"rate {y_km.name}\",\n", + ")\n", "log_rank_test = partial(\n", " log_rank_test,\n", " time=time_km,\n", @@ -575,43 +592,51 @@ "cell_type": "code", "execution_count": null, "id": "26a0e4a1", - "metadata": {}, + "metadata": { + "tags": [ + "hide-input" + ] + }, "outputs": [], "source": [ "for marker, _ in rejected.index[:TOP_N]: # first case done above currently\n", " fig, ax = plt.subplots()\n", - " class_weight = 'balanced'\n", + " class_weight = \"balanced\"\n", " # class_weight=None\n", " model = sklearn.linear_model.LogisticRegression(class_weight=class_weight)\n", " model = model.fit(X=clinic[marker].to_frame(), y=happend)\n", " print(\n", - " f\"Intercept {float(model.intercept_):5.3f}, coef.: {float(model.coef_):5.3f}\"\n", + " f\"Intercept {float(model.intercept_.squeeze()):5.3f}, coef.: {float(model.coef_.squeeze()):5.3f}\"\n", " )\n", - " # offset = np.log(p/(1-p)) # ! could be adapted based on proportion of target (for imbalanced data)\n", + " ! could be adapted based on proportion of target (for imbalanced data):\n", + " # offset = np.log(p/(1-p))\n", " offset = np.log(0.5 / (1 - 0.5)) # ! standard cutoff of probability of 0.5\n", - " cutoff = offset - float(model.intercept_) / float(model.coef_)\n", - " direction = '>' if model.coef_ > 0 else '<'\n", - " print(\n", - " f\"Custom cutoff defined by Logistic regressor for {marker:>10}: {cutoff:.3f}\"\n", - " )\n", + " cutoff = offset - float(model.intercept_.squeeze()) / float(model.coef_.squeeze())\n", + " direction = \">\" if model.coef_ > 0 else \"<\"\n", + " print(f\"Custom cutoff defined by Logistic regressor for {marker:>10}: {cutoff:.3f}\")\n", " pred = njab.sklearn.scoring.get_pred(model, clinic[marker].to_frame())\n", " ax, kmf_0, kmf_1 = compare_km_curves(pred=pred)\n", " res = log_rank_test(mask=pred)\n", " ax.set_title(\n", - " f'KM curve for {TARGET.lower()}'\n", - " f' and marker {marker} \\n'\n", - " f'(cutoff{direction}{cutoff:.2f}, log-rank-test p={res.p_value:.3f})')\n", - " ax.legend([\n", - " f\"KP pred=0 (N={(~pred).sum()})\", '95% CI (pred=0)',\n", - " f\"KP pred=1 (N={pred.sum()})\", '95% CI (pred=1)'\n", - " ])\n", - " fname = FOLDER / f'KM_plot_{marker}.pdf'\n", + " f\"KM curve for {TARGET.lower()}\"\n", + " f\" and marker {marker} \\n\"\n", + " f\"(cutoff{direction}{cutoff:.2f}, log-rank-test p={res.p_value:.3f})\"\n", + " )\n", + " ax.legend(\n", + " [\n", + " f\"KP pred=0 (N={(~pred).sum()})\",\n", + " \"95% CI (pred=0)\",\n", + " f\"KP pred=1 (N={pred.sum()})\",\n", + " \"95% CI (pred=1)\",\n", + " ]\n", + " )\n", + " fname = FOLDER / f\"KM_plot_{marker}.pdf\"\n", " files_out[fname.name] = fname\n", " njab.plotting.savefig(ax.get_figure(), fname)\n", "\n", " # add counts\n", " add_at_risk_counts(kmf_0, kmf_1, ax=ax)\n", - " fname = FOLDER / f'KM_plot_{marker}_w_counts.pdf'\n", + " fname = FOLDER / f\"KM_plot_{marker}_w_counts.pdf\"\n", " files_out[fname.name] = fname\n", " njab.plotting.savefig(ax.get_figure(), fname)" ] diff --git a/docs/tutorial/explorative_analysis.py b/docs/tutorial/explorative_analysis.py index 272fd39..b45ae5a 100644 --- a/docs/tutorial/explorative_analysis.py +++ b/docs/tutorial/explorative_analysis.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.15.2 +# jupytext_version: 1.16.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python @@ -22,48 +22,46 @@ # %% tags=["hide-output"] # %pip install 'njab[all]' openpyxl +import logging + # %% tags=["hide-cell"] from functools import partial from pathlib import Path -import logging - -from IPython.display import display +import matplotlib.pyplot as plt import numpy as np import pandas as pd - -import sklearn import seaborn +import sklearn +from IPython.display import display from lifelines.plotting import add_at_risk_counts -import matplotlib.pyplot as plt - -from njab.plotting.km import compare_km_curves, log_rank_test import njab import njab.plotting +from njab.plotting.km import compare_km_curves, log_rank_test njab.pandas.set_pandas_options() pd.options.display.min_rows = 10 -njab.plotting.set_font_sizes('x-small') +njab.plotting.set_font_sizes("x-small") seaborn.set_style("whitegrid") -plt.rcParams['figure.figsize'] = [4.0, 4.0] +plt.rcParams["figure.figsize"] = [4.0, 4.0] # %% [markdown] # ### Set parameters # %% tags=["parameters"] -TARGET = 'event' -TIME_KM = 'time' -FOLDER = 'prostate' -CLINIC = 'https://raw.githubusercontent.com/ErikinBC/SurvSet/main/SurvSet/_datagen/output/prostate.csv' -val_ids: str = '' # List of comma separated values or filepath +TARGET = "event" +TIME_KM = "time" +FOLDER = "prostate" +CLINIC = "https://raw.githubusercontent.com/ErikinBC/SurvSet/main/SurvSet/_datagen/output/prostate.csv" +val_ids: str = "" # List of comma separated values or filepath # # list or string of csv, eg. "var1,var2" -clinic_cont = ['age'] +clinic_cont = ["age"] # list or string of csv, eg. "var1,var2" -clinic_binary = ['male', 'AD'] +clinic_binary = ["male", "AD"] # List of comma separated values or filepath -da_covar = 'num_age,num_wt' +da_covar = "num_age,num_wt" # %% tags=["hide-input"] print(f"Time To Event: {TIME_KM}") @@ -77,34 +75,39 @@ # Inspect the data: # %% tags=["hide-input"] -clinic = pd.read_csv(CLINIC, index_col=0).dropna(how='any') -clinic.columns.name = 'feat_name' # ! check needs to be implemented +clinic = pd.read_csv(CLINIC, index_col=0).dropna(how="any") +clinic.columns.name = "feat_name" # ! check needs to be implemented cols_clinic = njab.pandas.get_colums_accessor(clinic) -clinic = clinic.astype({var: 'int' - for var in ['event', - 'time', - 'num_age', - 'num_wt', - 'num_sbp', - 'num_dbp', - 'num_sz', - 'num_sg', - 'num_sdate', - 'fac_stage']} - ) +clinic = clinic.astype( + { + var: "int" + for var in [ + "event", + "time", + "num_age", + "num_wt", + "num_sbp", + "num_dbp", + "num_sz", + "num_sg", + "num_sdate", + "fac_stage", + ] + } +) clinic # %% [markdown] # Descriptive statistics of non-numeric variables: # %% -clinic.describe(include='object') +clinic.describe(include="object") # %% [markdown] # Set the binary variables and convert them to categories: # %% -vars_binary = ['fac_hx', 'fac_bm'] -clinic[vars_binary] = clinic[vars_binary].astype('category') +vars_binary = ["fac_hx", "fac_bm"] +clinic[vars_binary] = clinic[vars_binary].astype("category") # %% [markdown] # Covariates to adjust for: @@ -119,8 +122,16 @@ # %% vars_cont = [ - 'num_age', 'num_wt', 'num_sbp', 'num_dbp', 'num_hg', 'num_sz', 'num_sg', - 'num_ap', 'num_sdate', 'fac_stage' + "num_age", + "num_wt", + "num_sbp", + "num_dbp", + "num_hg", + "num_sz", + "num_sg", + "num_ap", + "num_sdate", + "fac_stage", ] # %% [markdown] @@ -128,7 +139,7 @@ # in an excel file: # %% tags=["hide-input"] -fname = FOLDER / '1_differential_analysis.xlsx' +fname = FOLDER / "1_differential_analysis.xlsx" files_out = {fname.name: fname} writer = pd.ExcelWriter(fname) print(f"Output will be written to: {fname}") @@ -151,10 +162,10 @@ ana_differential = njab.stats.groups_comparision.diff_analysis( clinic[vars_cont], happend, - event_names=(TARGET, 'no event'), + event_names=(TARGET, "no event"), ) -ana_differential = ana_differential.sort_values(('ttest', 'p-val')) -ana_differential.to_excel(writer, "clinic continous", float_format='%.4f') +ana_differential = ana_differential.sort_values(("ttest", "p-val")) +ana_differential.to_excel(writer, sheet_name="clinic continous", float_format="%.4f") ana_differential # %% [markdown] @@ -165,19 +176,18 @@ for var in vars_binary: if len(clinic[var].cat.categories) == 2: diff_binomial.append( - njab.stats.groups_comparision.binomtest(clinic[var], - happend, - event_names=(TARGET, - 'no-event'))) + njab.stats.groups_comparision.binomtest( + clinic[var], happend, event_names=(TARGET, "no-event") + ) + ) else: logging.warning( f"Non-binary variable: {var} with {len(clinic[var].cat.categories)} categories" ) -diff_binomial = pd.concat(diff_binomial).sort_values( - ('binomial test', 'pvalue')) -diff_binomial.to_excel(writer, 'clinic binary', float_format='%.4f') -with pd.option_context('display.max_rows', len(diff_binomial)): +diff_binomial = pd.concat(diff_binomial).sort_values(("binomial test", "pvalue")) +diff_binomial.to_excel(writer, sheet_name="clinic binary", float_format="%.4f") +with pd.option_context("display.max_rows", len(diff_binomial)): display(diff_binomial) # %% [markdown] @@ -189,33 +199,31 @@ # %% tags=["hide-input"] clinic_ancova = [TARGET, *covar] clinic_ancova = clinic[clinic_ancova].copy() -clinic_ancova.describe(include='all') +clinic_ancova.describe(include="all") # %% [markdown] # Discard all rows with a missing features (if present): # %% tags=["hide-input"] -clinic_ancova = clinic_ancova.dropna( -) -categorical_columns = clinic_ancova.columns[clinic_ancova.dtypes == 'category'] +clinic_ancova = clinic_ancova.dropna() +categorical_columns = clinic_ancova.columns[clinic_ancova.dtypes == "category"] print("Available covariates: " ", ".join(categorical_columns.to_list())) for categorical_column in categorical_columns: # only works if no NA and only binary variables! - clinic_ancova[categorical_column] = clinic_ancova[ - categorical_column].cat.codes + clinic_ancova[categorical_column] = clinic_ancova[categorical_column].cat.codes desc_ancova = clinic_ancova.describe() -desc_ancova.to_excel(writer, "covars", float_format='%.4f') +desc_ancova.to_excel(writer, sheet_name="covars", float_format="%.4f") desc_ancova # %% [markdown] # Remove non-varying variables (if present): # %% tags=["hide-input"] -if (desc_ancova.loc['std'] < 0.001).sum(): - non_varying = desc_ancova.loc['std'] < 0.001 +if (desc_ancova.loc["std"] < 0.001).sum(): + non_varying = desc_ancova.loc["std"] < 0.001 non_varying = non_varying[non_varying].index - print("Non varying columns: ", ', '.join(non_varying)) + print("Non varying columns: ", ", ".join(non_varying)) clinic_ancova = clinic_ancova.drop(non_varying, axis=1) for col in non_varying: covar.remove(col) @@ -229,12 +237,14 @@ df_clinic=clinic_ancova, target=TARGET, covar=covar, - value_name='') -ancova = ancova.ancova().sort_values('p-unc') + value_name="", +) +ancova = ancova.ancova().sort_values("p-unc") ancova = ancova.loc[:, "p-unc":] -ancova.columns = pd.MultiIndex.from_product([['ancova'], ancova.columns], - names=('test', 'var')) -ancova.to_excel(writer, "olink controlled", float_format='%.4f') +ancova.columns = pd.MultiIndex.from_product( + [["ancova"], ancova.columns], names=("test", "var") +) +ancova.to_excel(writer, sheet_name="olink controlled", float_format="%.4f") ancova.head(20) # %% tags=["hide-input"] @@ -261,15 +271,17 @@ # %% [markdown] # Settings for plots # %% -class_weight = 'balanced' +class_weight = "balanced" y_km = clinic[TARGET] time_km = clinic[TIME_KM] -compare_km_curves = partial(compare_km_curves, - time=time_km, - y=y_km, - xlim=(0, 80), - xlabel='time passed', - ylabel=f'rate {y_km.name}') +compare_km_curves = partial( + compare_km_curves, + time=time_km, + y=y_km, + xlim=(0, 80), + xlabel="time passed", + ylabel=f"rate {y_km.name}", +) log_rank_test = partial( log_rank_test, time=time_km, @@ -283,38 +295,42 @@ # %% tags=["hide-input"] for marker, _ in rejected.index[:TOP_N]: # first case done above currently fig, ax = plt.subplots() - class_weight = 'balanced' + class_weight = "balanced" # class_weight=None model = sklearn.linear_model.LogisticRegression(class_weight=class_weight) model = model.fit(X=clinic[marker].to_frame(), y=happend) print( - f"Intercept {float(model.intercept_):5.3f}, coef.: {float(model.coef_):5.3f}" + f"Intercept {float(model.intercept_.squeeze()):5.3f}, coef.: {float(model.coef_.squeeze()):5.3f}" ) - # offset = np.log(p/(1-p)) # ! could be adapted based on proportion of target (for imbalanced data) + # ! could be adapted based on proportion of target (for imbalanced data): + # offset = np.log(p/(1-p)) offset = np.log(0.5 / (1 - 0.5)) # ! standard cutoff of probability of 0.5 - cutoff = offset - float(model.intercept_) / float(model.coef_) - direction = '>' if model.coef_ > 0 else '<' - print( - f"Custom cutoff defined by Logistic regressor for {marker:>10}: {cutoff:.3f}" - ) + cutoff = offset - float(model.intercept_.squeeze()) / float(model.coef_.squeeze()) + direction = ">" if model.coef_ > 0 else "<" + print(f"Custom cutoff defined by Logistic regressor for {marker:>10}: {cutoff:.3f}") pred = njab.sklearn.scoring.get_pred(model, clinic[marker].to_frame()) ax, kmf_0, kmf_1 = compare_km_curves(pred=pred) res = log_rank_test(mask=pred) ax.set_title( - f'KM curve for {TARGET.lower()}' - f' and marker {marker} \n' - f'(cutoff{direction}{cutoff:.2f}, log-rank-test p={res.p_value:.3f})') - ax.legend([ - f"KP pred=0 (N={(~pred).sum()})", '95% CI (pred=0)', - f"KP pred=1 (N={pred.sum()})", '95% CI (pred=1)' - ]) - fname = FOLDER / f'KM_plot_{marker}.pdf' + f"KM curve for {TARGET.lower()}" + f" and marker {marker} \n" + f"(cutoff{direction}{cutoff:.2f}, log-rank-test p={res.p_value:.3f})" + ) + ax.legend( + [ + f"KP pred=0 (N={(~pred).sum()})", + "95% CI (pred=0)", + f"KP pred=1 (N={pred.sum()})", + "95% CI (pred=1)", + ] + ) + fname = FOLDER / f"KM_plot_{marker}.pdf" files_out[fname.name] = fname njab.plotting.savefig(ax.get_figure(), fname) # add counts add_at_risk_counts(kmf_0, kmf_1, ax=ax) - fname = FOLDER / f'KM_plot_{marker}_w_counts.pdf' + fname = FOLDER / f"KM_plot_{marker}_w_counts.pdf" files_out[fname.name] = fname njab.plotting.savefig(ax.get_figure(), fname) diff --git a/docs/tutorial/log_reg.ipynb b/docs/tutorial/log_reg.ipynb index 66bc42f..d51ecac 100644 --- a/docs/tutorial/log_reg.ipynb +++ b/docs/tutorial/log_reg.ipynb @@ -63,19 +63,22 @@ "from njab.plotting.metrics import plot_auc, plot_prc\n", "from njab.sklearn import StandardScaler\n", "from njab.sklearn import pca as njab_pca\n", - "from njab.sklearn.scoring import (ConfusionMatrix,\n", - " get_lr_multiplicative_decomposition,\n", - " get_pred, get_score,\n", - " get_target_count_per_bin)\n", + "from njab.sklearn.scoring import (\n", + " ConfusionMatrix,\n", + " get_lr_multiplicative_decomposition,\n", + " get_pred,\n", + " get_score,\n", + " get_target_count_per_bin,\n", + ")\n", "from njab.sklearn.types import Splits\n", "\n", - "logger = logging.getLogger('njab')\n", + "logger = logging.getLogger(\"njab\")\n", "logger.setLevel(logging.INFO)\n", "\n", "njab.pandas.set_pandas_options()\n", "pd.options.display.min_rows = 10\n", "pd.options.display.max_columns = 20\n", - "njab.plotting.set_font_sizes('x-small')\n", + "njab.plotting.set_font_sizes(\"x-small\")\n", "seaborn.set_style(\"whitegrid\")\n", "\n", "njab.plotting.set_font_sizes(8)" @@ -100,17 +103,21 @@ }, "outputs": [], "source": [ - "CLINIC: str = 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/clinic_ml.csv' # clincial data\n", - "fname_omics: str = 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv' # omics data\n", - "TARGET: str = 'AD' # target column in CLINIC dataset (binary)\n", + "CLINIC: str = (\n", + " \"https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/clinic_ml.csv\" # clincial data\n", + ")\n", + "fname_omics: str = (\n", + " \"https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv\" # omics data\n", + ")\n", + "TARGET: str = \"AD\" # target column in CLINIC dataset (binary)\n", "TARGET_LABEL: Optional[str] = None # optional: rename target variable\n", "n_features_max: int = 5\n", "freq_cutoff: float = 0.5 # Omics cutoff for sample completeness\n", - "VAL_IDS: str = '' #\n", - "VAL_IDS_query: str = ''\n", + "VAL_IDS: str = \"\" #\n", + "VAL_IDS_query: str = \"\"\n", "weights: bool = True\n", - "FOLDER = 'alzheimer'\n", - "model_name = 'all'" + "FOLDER = \"alzheimer\"\n", + "model_name = \"all\"" ] }, { @@ -187,7 +194,8 @@ "M_after = omics.shape[1]\n", "msg = (\n", " f\"Removed {M_before-M_after} features with more than {freq_cutoff*100}% missing values.\"\n", - " f\"\\nRemaining features: {M_after} (of {M_before})\")\n", + " f\"\\nRemaining features: {M_after} (of {M_before})\"\n", + ")\n", "print(msg)\n", "# keep a map of all proteins in protein group, but only display first protein\n", "# proteins are unique to protein groups\n", @@ -252,8 +260,10 @@ "target_counts = clinic[TARGET].value_counts()\n", "\n", "if target_counts.sum() < len(clinic):\n", - " print(\"Target has missing values.\"\n", - " f\" Can only use {target_counts.sum()} of {len(clinic)} samples.\")\n", + " print(\n", + " \"Target has missing values.\"\n", + " f\" Can only use {target_counts.sum()} of {len(clinic)} samples.\"\n", + " )\n", " mask = clinic[TARGET].notna()\n", " clinic, omics = clinic.loc[mask], omics.loc[mask]" ] @@ -300,10 +310,8 @@ " else:\n", " logging.warning(\"Create train and test split.\")\n", " _, VAL_IDS = sklearn.model_selection.train_test_split(\n", - " clinic.index,\n", - " test_size=0.2,\n", - " random_state=123,\n", - " stratify=clinic[TARGET])\n", + " clinic.index, test_size=0.2, random_state=123, stratify=clinic[TARGET]\n", + " )\n", " VAL_IDS = list(VAL_IDS)\n", "elif isinstance(VAL_IDS, str):\n", " VAL_IDS = VAL_IDS.split(\",\")\n", @@ -380,14 +388,16 @@ }, "outputs": [], "source": [ - "TRAIN_LABEL = 'train'\n", - "TEST_LABEL = 'test'\n", + "TRAIN_LABEL = \"train\"\n", + "TEST_LABEL = \"test\"\n", "if VAL_IDS:\n", " diff = pd.Index(VAL_IDS)\n", " VAL_IDS = X.index.intersection(VAL_IDS)\n", " if len(diff) < len(VAL_IDS):\n", - " logging.warning(\"Some requested validation IDs are not in the data: \"\n", - " \",\".join(str(x) for x in diff.difference(VAL_IDS)))\n", + " logging.warning(\n", + " \"Some requested validation IDs are not in the data: \"\n", + " \",\".join(str(x) for x in diff.difference(VAL_IDS))\n", + " )\n", " X_val = X.loc[VAL_IDS]\n", " X = X.drop(VAL_IDS)\n", "\n", @@ -443,7 +453,7 @@ "source": [ "# out\n", "files_out = {}\n", - "fname = FOLDER / 'log_reg.xlsx'\n", + "fname = FOLDER / \"log_reg.xlsx\"\n", "files_out[fname.stem] = fname\n", "writer = pd.ExcelWriter(fname)\n", "print(f\"Excel-file for tables: {fname}\")" @@ -464,7 +474,7 @@ "metadata": {}, "outputs": [], "source": [ - "predictions = y_val.to_frame('true')" + "predictions = y_val.to_frame(\"true\")" ] }, { @@ -522,7 +532,7 @@ "metadata": {}, "outputs": [], "source": [ - "median_imputer = sklearn.impute.SimpleImputer(strategy='median')\n", + "median_imputer = sklearn.impute.SimpleImputer(strategy=\"median\")\n", "\n", "X = njab.sklearn.transform_DataFrame(X, median_imputer.fit_transform)\n", "X_val = njab.sklearn.transform_DataFrame(X_val, median_imputer.transform)\n", @@ -556,7 +566,7 @@ "PCs, pca = njab_pca.run_pca(X_scaled, n_components=None)\n", "files_out[\"var_explained_by_PCs.pdf\"] = FOLDER / \"var_explained_by_PCs.pdf\"\n", "ax = njab_pca.plot_explained_variance(pca)\n", - "ax.locator_params(axis='x', integer=True)\n", + "ax.locator_params(axis=\"x\", integer=True)\n", "njab.plotting.savefig(ax.get_figure(), files_out[\"var_explained_by_PCs.pdf\"])\n", "X_scaled.shape" ] @@ -580,18 +590,18 @@ }, "outputs": [], "source": [ - "files_out['scatter_first_5PCs.pdf'] = FOLDER / 'scatter_first_5PCs.pdf'\n", + "files_out[\"scatter_first_5PCs.pdf\"] = FOLDER / \"scatter_first_5PCs.pdf\"\n", "\n", - "fig, axes = plt.subplots(5, 2, figsize=(6, 8), layout='constrained')\n", + "fig, axes = plt.subplots(5, 2, figsize=(6, 8), layout=\"constrained\")\n", "PCs.columns = [s.replace(\"principal component\", \"PC\") for s in PCs.columns]\n", - "PCs = PCs.join(y.astype('category'))\n", + "PCs = PCs.join(y.astype(\"category\"))\n", "up_to = min(PCs.shape[-1], 5)\n", "# https://github.com/matplotlib/matplotlib/issues/25538\n", "# colab: old pandas version and too new matplotlib version (2023-11-6)\n", "for (i, j), ax in zip(itertools.combinations(range(up_to), 2), axes.flatten()):\n", - " PCs.plot.scatter(i, j, c=TARGET_LABEL, cmap='Paired', ax=ax)\n", + " PCs.plot.scatter(i, j, c=TARGET_LABEL, cmap=\"Paired\", ax=ax)\n", "_ = PCs.pop(TARGET_LABEL)\n", - "njab.plotting.savefig(fig, files_out['scatter_first_5PCs.pdf'])" + "njab.plotting.savefig(fig, files_out[\"scatter_first_5PCs.pdf\"])" ] }, { @@ -617,14 +627,13 @@ "reducer = umap.UMAP()\n", "embedding = reducer.fit_transform(X_scaled)\n", "\n", - "files_out['umap.pdf'] = FOLDER / 'umap.pdf'\n", + "files_out[\"umap.pdf\"] = FOLDER / \"umap.pdf\"\n", "\n", - "embedding = pd.DataFrame(embedding,\n", - " index=X_scaled.index,\n", - " columns=['UMAP 1',\n", - " 'UMAP 2']).join(y.astype('category'))\n", - "ax = embedding.plot.scatter('UMAP 1', 'UMAP 2', c=TARGET_LABEL, cmap='Paired')\n", - "njab.plotting.savefig(ax.get_figure(), files_out['umap.pdf'])" + "embedding = pd.DataFrame(\n", + " embedding, index=X_scaled.index, columns=[\"UMAP 1\", \"UMAP 2\"]\n", + ").join(y.astype(\"category\"))\n", + "ax = embedding.plot.scatter(\"UMAP 1\", \"UMAP 2\", c=TARGET_LABEL, cmap=\"Paired\")\n", + "njab.plotting.savefig(ax.get_figure(), files_out[\"umap.pdf\"])" ] }, { @@ -646,7 +655,7 @@ "outputs": [], "source": [ "if weights:\n", - " weights = 'balanced'\n", + " weights = \"balanced\"\n", " cutoff = 0.5\n", "else:\n", " cutoff = None\n", @@ -673,12 +682,10 @@ "metadata": {}, "outputs": [], "source": [ - "splits = Splits(X_train=X_scaled,\n", - " X_test=scaler.transform(X_val),\n", - " y_train=y,\n", - " y_test=y_val)\n", - "model = sklearn.linear_model.LogisticRegression(penalty='l2',\n", - " class_weight=weights)" + "splits = Splits(\n", + " X_train=X_scaled, X_test=scaler.transform(X_val), y_train=y, y_test=y_val\n", + ")\n", + "model = sklearn.linear_model.LogisticRegression(penalty=\"l2\", class_weight=weights)" ] }, { @@ -693,14 +700,16 @@ "outputs": [], "source": [ "scoring = [\n", - " 'precision', 'recall', 'f1', 'balanced_accuracy', 'roc_auc',\n", - " 'average_precision'\n", + " \"precision\",\n", + " \"recall\",\n", + " \"f1\",\n", + " \"balanced_accuracy\",\n", + " \"roc_auc\",\n", + " \"average_precision\",\n", "]\n", "scoring = {k: k for k in scoring}\n", "# do not average log loss for AIC and BIC calculations\n", - "scoring['log_loss'] = make_scorer(log_loss,\n", - " greater_is_better=True,\n", - " normalize=False)\n", + "scoring[\"log_loss\"] = make_scorer(log_loss, greater_is_better=True, normalize=False)\n", "cv_feat = njab.sklearn.find_n_best_features(\n", " X=splits.X_train,\n", " y=splits.y_train,\n", @@ -712,8 +721,7 @@ " return_train_score=True,\n", " # fit_params=dict(sample_weight=weights)\n", ")\n", - "cv_feat = cv_feat.drop('test_case',\n", - " axis=1).groupby('n_features').agg(['mean', 'std'])\n", + "cv_feat = cv_feat.drop(\"test_case\", axis=1).groupby(\"n_features\").agg([\"mean\", \"std\"])\n", "cv_feat" ] }, @@ -739,19 +747,19 @@ "# AIC vs BIC on train and test data with bigger is better\n", "IC_criteria = pd.DataFrame()\n", "N_split = {\n", - " 'train': round(len(splits.X_train) * 0.8),\n", - " 'test': round(len(splits.X_train) * 0.2)\n", + " \"train\": round(len(splits.X_train) * 0.8),\n", + " \"test\": round(len(splits.X_train) * 0.2),\n", "}\n", "\n", - "for _split in ('train', 'test'):\n", + "for _split in (\"train\", \"test\"):\n", "\n", - " IC_criteria[(f'{_split}_neg_AIC',\n", - " 'mean')] = -(2 * cv_feat.index.to_series() -\n", - " 2 * cv_feat[(f'{_split}_log_loss', 'mean')])\n", - " IC_criteria[(\n", - " f'{_split}_neg_BIC',\n", - " 'mean')] = -(cv_feat.index.to_series() * np.log(N_split[_split]) -\n", - " 2 * cv_feat[(f'{_split}_log_loss', 'mean')])\n", + " IC_criteria[(f\"{_split}_neg_AIC\", \"mean\")] = -(\n", + " 2 * cv_feat.index.to_series() - 2 * cv_feat[(f\"{_split}_log_loss\", \"mean\")]\n", + " )\n", + " IC_criteria[(f\"{_split}_neg_BIC\", \"mean\")] = -(\n", + " cv_feat.index.to_series() * np.log(N_split[_split])\n", + " - 2 * cv_feat[(f\"{_split}_log_loss\", \"mean\")]\n", + " )\n", "IC_criteria.columns = pd.MultiIndex.from_tuples(IC_criteria.columns)\n", "IC_criteria" ] @@ -773,7 +781,8 @@ "source": [ "cv_feat = cv_feat.join(IC_criteria)\n", "cv_feat = cv_feat.filter(regex=\"train|test\", axis=1).style.highlight_max(\n", - " axis=0, subset=pd.IndexSlice[:, pd.IndexSlice[:, 'mean']])\n", + " axis=0, subset=pd.IndexSlice[:, pd.IndexSlice[:, \"mean\"]]\n", + ")\n", "cv_feat" ] }, @@ -792,7 +801,7 @@ "metadata": {}, "outputs": [], "source": [ - "cv_feat.to_excel(writer, 'CV', float_format='%.3f')\n", + "cv_feat.to_excel(writer, sheet_name=\"CV\", float_format=\"%.3f\")\n", "cv_feat = cv_feat.data" ] }, @@ -815,11 +824,11 @@ }, "outputs": [], "source": [ - "mask = cv_feat.columns.levels[0].str[:4] == 'test'\n", + "mask = cv_feat.columns.levels[0].str[:4] == \"test\"\n", "scores_cols = cv_feat.columns.levels[0][mask]\n", - "n_feat_best = cv_feat.loc[:, pd.IndexSlice[scores_cols, 'mean']].idxmax()\n", - "n_feat_best.name = 'best'\n", - "n_feat_best.to_excel(writer, 'n_feat_best')\n", + "n_feat_best = cv_feat.loc[:, pd.IndexSlice[scores_cols, \"mean\"]].idxmax()\n", + "n_feat_best.name = \"best\"\n", + "n_feat_best.to_excel(writer, sheet_name=\"n_feat_best\")\n", "n_feat_best" ] }, @@ -841,7 +850,7 @@ "results_model = njab.sklearn.run_model(\n", " model=model,\n", " splits=splits,\n", - " n_feat_to_select=n_feat_best.loc['test_roc_auc', 'mean'],\n", + " n_feat_to_select=n_feat_best.loc[\"test_roc_auc\", \"mean\"],\n", ")\n", "results_model.name = model_name" ] @@ -865,12 +874,11 @@ }, "outputs": [], "source": [ - "ax = plot_auc(results_model,\n", - " label_train=TRAIN_LABEL,\n", - " label_test=TEST_LABEL,\n", - " figsize=(4, 2))\n", - "files_out['ROAUC'] = FOLDER / 'plot_roauc.pdf'\n", - "njab.plotting.savefig(ax.get_figure(), files_out['ROAUC'])" + "ax = plot_auc(\n", + " results_model, label_train=TRAIN_LABEL, label_test=TEST_LABEL, figsize=(4, 2)\n", + ")\n", + "files_out[\"ROAUC\"] = FOLDER / \"plot_roauc.pdf\"\n", + "njab.plotting.savefig(ax.get_figure(), files_out[\"ROAUC\"])" ] }, { @@ -892,12 +900,11 @@ }, "outputs": [], "source": [ - "ax = plot_prc(results_model,\n", - " label_train=TRAIN_LABEL,\n", - " label_test=TEST_LABEL,\n", - " figsize=(4, 2))\n", - "files_out['PRAUC'] = FOLDER / 'plot_prauc.pdf'\n", - "njab.plotting.savefig(ax.get_figure(), files_out['PRAUC'])" + "ax = plot_prc(\n", + " results_model, label_train=TRAIN_LABEL, label_test=TEST_LABEL, figsize=(4, 2)\n", + ")\n", + "files_out[\"PRAUC\"] = FOLDER / \"plot_prauc.pdf\"\n", + "njab.plotting.savefig(ax.get_figure(), files_out[\"PRAUC\"])" ] }, { @@ -919,10 +926,12 @@ }, "outputs": [], "source": [ - "pd.DataFrame({\n", - " 'coef': results_model.model.coef_.flatten(),\n", - " 'name': results_model.model.feature_names_in_\n", - "})" + "pd.DataFrame(\n", + " {\n", + " \"coef\": results_model.model.coef_.flatten(),\n", + " \"name\": results_model.model.feature_names_in_,\n", + " }\n", + ")" ] }, { @@ -955,7 +964,7 @@ "outputs": [], "source": [ "des_selected_feat = splits.X_train[results_model.selected_features].describe()\n", - "des_selected_feat.to_excel(writer, 'sel_feat', float_format='%.3f')\n", + "des_selected_feat.to_excel(writer, sheet_name=\"sel_feat\", float_format=\"%.3f\")\n", "des_selected_feat" ] }, @@ -979,9 +988,9 @@ "outputs": [], "source": [ "fig = plt.figure(figsize=(6, 6))\n", - "files_out['corr_plot_train.pdf'] = FOLDER / 'corr_plot_train.pdf'\n", + "files_out[\"corr_plot_train.pdf\"] = FOLDER / \"corr_plot_train.pdf\"\n", "_ = corrplot(X[results_model.selected_features].join(y).corr(), size_scale=300)\n", - "njab.plotting.savefig(fig, files_out['corr_plot_train.pdf'])" + "njab.plotting.savefig(fig, files_out[\"corr_plot_train.pdf\"])" ] }, { @@ -1004,18 +1013,16 @@ "outputs": [], "source": [ "N_BINS = 20\n", - "score = get_score(clf=results_model.model,\n", - " X=splits.X_train[results_model.selected_features],\n", - " pos=1)\n", + "score = get_score(\n", + " clf=results_model.model, X=splits.X_train[results_model.selected_features], pos=1\n", + ")\n", "ax = score.hist(bins=N_BINS)\n", - "files_out['hist_score_train.pdf'] = FOLDER / 'hist_score_train.pdf'\n", - "njab.plotting.savefig(ax.get_figure(), files_out['hist_score_train.pdf'])\n", + "files_out[\"hist_score_train.pdf\"] = FOLDER / \"hist_score_train.pdf\"\n", + "njab.plotting.savefig(ax.get_figure(), files_out[\"hist_score_train.pdf\"])\n", "pred_bins = get_target_count_per_bin(score, y, n_bins=N_BINS)\n", - "ax = pred_bins.plot(kind='bar', ylabel='count')\n", - "files_out[\n", - " 'hist_score_train_target.pdf'] = FOLDER / 'hist_score_train_target.pdf'\n", - "njab.plotting.savefig(ax.get_figure(),\n", - " files_out['hist_score_train_target.pdf'])\n", + "ax = pred_bins.plot(kind=\"bar\", ylabel=\"count\")\n", + "files_out[\"hist_score_train_target.pdf\"] = FOLDER / \"hist_score_train_target.pdf\"\n", + "njab.plotting.savefig(ax.get_figure(), files_out[\"hist_score_train_target.pdf\"])\n", "# pred_bins" ] }, @@ -1038,20 +1045,20 @@ }, "outputs": [], "source": [ - "score_val = get_score(clf=results_model.model,\n", - " X=splits.X_test[results_model.selected_features],\n", - " pos=1)\n", - "predictions['score'] = score_val\n", + "score_val = get_score(\n", + " clf=results_model.model, X=splits.X_test[results_model.selected_features], pos=1\n", + ")\n", + "predictions[\"score\"] = score_val\n", "ax = score_val.hist(bins=N_BINS) # list(x/N_BINS for x in range(0,N_BINS)))\n", - "ax.set_ylabel('count')\n", + "ax.set_ylabel(\"count\")\n", "ax.set_xlim(0, 1)\n", - "files_out['hist_score_test.pdf'] = FOLDER / 'hist_score_test.pdf'\n", - "njab.plotting.savefig(ax.get_figure(), files_out['hist_score_test.pdf'])\n", + "files_out[\"hist_score_test.pdf\"] = FOLDER / \"hist_score_test.pdf\"\n", + "njab.plotting.savefig(ax.get_figure(), files_out[\"hist_score_test.pdf\"])\n", "pred_bins_val = get_target_count_per_bin(score_val, y_val, n_bins=N_BINS)\n", - "ax = pred_bins_val.plot(kind='bar', ylabel='count')\n", - "ax.locator_params(axis='y', integer=True)\n", - "files_out['hist_score_test_target.pdf'] = FOLDER / 'hist_score_test_target.pdf'\n", - "njab.plotting.savefig(ax.get_figure(), files_out['hist_score_test_target.pdf'])\n", + "ax = pred_bins_val.plot(kind=\"bar\", ylabel=\"count\")\n", + "ax.locator_params(axis=\"y\", integer=True)\n", + "files_out[\"hist_score_test_target.pdf\"] = FOLDER / \"hist_score_test_target.pdf\"\n", + "njab.plotting.savefig(ax.get_figure(), files_out[\"hist_score_test_target.pdf\"])\n", "# pred_bins_val" ] }, @@ -1076,8 +1083,7 @@ }, "outputs": [], "source": [ - "prc = pd.DataFrame(results_model.train.prc,\n", - " index='precision recall cutoffs'.split())\n", + "prc = pd.DataFrame(results_model.train.prc, index=\"precision recall cutoffs\".split())\n", "prc" ] }, @@ -1092,9 +1098,12 @@ }, "outputs": [], "source": [ - "prc.loc['f1_score'] = 2 * (prc.loc['precision'] * prc.loc['recall']) / (\n", - " 1 / prc.loc['precision'] + 1 / prc.loc['recall'])\n", - "f1_max = prc[prc.loc['f1_score'].argmax()]\n", + "prc.loc[\"f1_score\"] = (\n", + " 2\n", + " * (prc.loc[\"precision\"] * prc.loc[\"recall\"])\n", + " / (1 / prc.loc[\"precision\"] + 1 / prc.loc[\"recall\"])\n", + ")\n", + "f1_max = prc[prc.loc[\"f1_score\"].argmax()]\n", "f1_max" ] }, @@ -1117,7 +1126,7 @@ }, "outputs": [], "source": [ - "cutoff = float(f1_max.loc['cutoffs'])\n", + "cutoff = float(f1_max.loc[\"cutoffs\"])\n", "cutoff" ] }, @@ -1135,13 +1144,15 @@ "y_pred_val = njab.sklearn.scoring.get_custom_pred(\n", " clf=results_model.model,\n", " X=splits.X_test[results_model.selected_features],\n", - " cutoff=cutoff)\n", + " cutoff=cutoff,\n", + ")\n", "predictions[model_name] = y_pred_val\n", - "predictions['dead'] = y_val\n", + "predictions[\"dead\"] = y_val\n", "_ = ConfusionMatrix(y_val, y_pred_val).as_dataframe()\n", - "_.columns = pd.MultiIndex.from_tuples([(t[0] + f\" - {cutoff:.3f}\", t[1])\n", - " for t in _.columns])\n", - "_.to_excel(writer, \"CM_test_cutoff_adapted\")\n", + "_.columns = pd.MultiIndex.from_tuples(\n", + " [(t[0] + f\" - {cutoff:.3f}\", t[1]) for t in _.columns]\n", + ")\n", + "_.to_excel(writer, sheet_name=\"CM_test_cutoff_adapted\")\n", "_" ] }, @@ -1156,14 +1167,14 @@ }, "outputs": [], "source": [ - "y_pred_val = get_pred(clf=results_model.model,\n", - " X=splits.X_test[results_model.selected_features])\n", + "y_pred_val = get_pred(\n", + " clf=results_model.model, X=splits.X_test[results_model.selected_features]\n", + ")\n", "predictions[model_name] = y_pred_val\n", - "predictions['dead'] = y_val\n", + "predictions[\"dead\"] = y_val\n", "_ = ConfusionMatrix(y_val, y_pred_val).as_dataframe()\n", - "_.columns = pd.MultiIndex.from_tuples([(t[0] + f\" - {0.5}\", t[1])\n", - " for t in _.columns])\n", - "_.to_excel(writer, \"CM_test_cutoff_0.5\")\n", + "_.columns = pd.MultiIndex.from_tuples([(t[0] + f\" - {0.5}\", t[1]) for t in _.columns])\n", + "_.to_excel(writer, sheet_name=\"CM_test_cutoff_0.5\")\n", "_" ] }, @@ -1187,14 +1198,13 @@ }, "outputs": [], "source": [ - "components = get_lr_multiplicative_decomposition(results=results_model,\n", - " X=splits.X_train,\n", - " prob=score,\n", - " y=y)\n", - "components.to_excel(writer, 'decomp_multiplicative_train')\n", - "components.to_excel(writer,\n", - " 'decomp_multiplicative_train_view',\n", - " float_format='%.5f')\n", + "components = get_lr_multiplicative_decomposition(\n", + " results=results_model, X=splits.X_train, prob=score, y=y\n", + ")\n", + "components.to_excel(writer, sheet_name=\"decomp_multiplicative_train\")\n", + "components.to_excel(\n", + " writer, sheet_name=\"decomp_multiplicative_train_view\", float_format=\"%.5f\"\n", + ")\n", "components.head(10)" ] }, @@ -1210,14 +1220,13 @@ }, "outputs": [], "source": [ - "components_test = get_lr_multiplicative_decomposition(results=results_model,\n", - " X=splits.X_test,\n", - " prob=score_val,\n", - " y=y_val)\n", - "components_test.to_excel(writer, 'decomp_multiplicative_test')\n", - "components_test.to_excel(writer,\n", - " 'decomp_multiplicative_test_view',\n", - " float_format='%.5f')\n", + "components_test = get_lr_multiplicative_decomposition(\n", + " results=results_model, X=splits.X_test, prob=score_val, y=y_val\n", + ")\n", + "components_test.to_excel(writer, sheet_name=\"decomp_multiplicative_test\")\n", + "components_test.to_excel(\n", + " writer, sheet_name=\"decomp_multiplicative_test_view\", float_format=\"%.5f\"\n", + ")\n", "components_test.head(10)" ] }, @@ -1249,13 +1258,13 @@ "# make sure to have two or more features?\n", "M_sel = len(results_model.selected_features)\n", "if M_sel > 1:\n", - " embedding = reducer.fit_transform(\n", - " X_scaled[results_model.selected_features])\n", + " embedding = reducer.fit_transform(X_scaled[results_model.selected_features])\n", "\n", - " embedding = pd.DataFrame(embedding,\n", - " index=X_scaled.index,\n", - " columns=['UMAP dimension 1', 'UMAP dimension 2'\n", - " ]).join(y.astype('category'))\n", + " embedding = pd.DataFrame(\n", + " embedding,\n", + " index=X_scaled.index,\n", + " columns=[\"UMAP dimension 1\", \"UMAP dimension 2\"],\n", + " ).join(y.astype(\"category\"))\n", " display(embedding.head(3))\n", "else:\n", " embedding = None" @@ -1280,12 +1289,14 @@ }, "outputs": [], "source": [ - "predictions['label'] = predictions.apply(\n", + "predictions[\"label\"] = predictions.apply(\n", " lambda x: njab.sklearn.scoring.get_label_binary_classification(\n", - " x['true'], x[model_name]),\n", - " axis=1)\n", - "mask = predictions[['true', model_name]].sum(axis=1).astype(bool)\n", - "predictions.loc[mask].sort_values('score', ascending=False)" + " x[\"true\"], x[model_name]\n", + " ),\n", + " axis=1,\n", + ")\n", + "mask = predictions[[\"true\", model_name]].sum(axis=1).astype(bool)\n", + "predictions.loc[mask].sort_values(\"score\", ascending=False)" ] }, { @@ -1304,7 +1315,8 @@ " embedding_val = pd.DataFrame(\n", " reducer.transform(X_val_scaled[results_model.selected_features]),\n", " index=X_val_scaled.index,\n", - " columns=['UMAP dimension 1', 'UMAP dimension 2'])\n", + " columns=[\"UMAP dimension 1\", \"UMAP dimension 2\"],\n", + " )\n", " embedding_val.sample(3)" ] }, @@ -1320,15 +1332,20 @@ "outputs": [], "source": [ "pred_train = (\n", - " y.to_frame('true')\n", + " y.to_frame(\"true\")\n", " # .join(get_score(clf=results_model.model, X=splits.X_train[results_model.selected_features], pos=1))\n", - " .join(score.rename('score')).join(\n", - " get_pred(results_model.model, splits.X_train[\n", - " results_model.selected_features]).rename(model_name)))\n", - "pred_train['label'] = pred_train.apply(\n", + " .join(score.rename(\"score\")).join(\n", + " get_pred(\n", + " results_model.model, splits.X_train[results_model.selected_features]\n", + " ).rename(model_name)\n", + " )\n", + ")\n", + "pred_train[\"label\"] = pred_train.apply(\n", " lambda x: njab.sklearn.scoring.get_label_binary_classification(\n", - " x['true'], x[model_name]),\n", - " axis=1)\n", + " x[\"true\"], x[model_name]\n", + " ),\n", + " axis=1,\n", + ")\n", "pred_train.sample(5)" ] }, @@ -1361,22 +1378,26 @@ "if embedding is not None:\n", " fig, axes = plt.subplots(1, 2, figsize=(8, 4), sharex=True, sharey=True)\n", " for _embedding, ax, _title, _model_pred_label in zip(\n", - " [embedding, embedding_val], axes, [TRAIN_LABEL, TEST_LABEL],\n", - " [pred_train['label'], predictions['label']]): # noqa: E129\n", + " [embedding, embedding_val],\n", + " axes,\n", + " [TRAIN_LABEL, TEST_LABEL],\n", + " [pred_train[\"label\"], predictions[\"label\"]],\n", + " ): # noqa: E129\n", " ax = seaborn.scatterplot(\n", " x=_embedding.iloc[:, 0],\n", " y=_embedding.iloc[:, 1],\n", " hue=_model_pred_label,\n", - " hue_order=['TN', 'TP', 'FN', 'FP'],\n", + " hue_order=[\"TN\", \"TP\", \"FN\", \"FP\"],\n", " palette=[colors[0], colors[2], colors[1], colors[3]],\n", - " ax=ax)\n", + " ax=ax,\n", + " )\n", " ax.set_title(_title)\n", "\n", " # files_out['pred_pca_labeled'] = FOLDER / 'pred_pca_labeled.pdf'\n", " # njab.plotting.savefig(fig, files_out['pred_pca_labeled'])\n", "\n", - " files_out['umap_sel_feat.pdf'] = FOLDER / 'umap_sel_feat.pdf'\n", - " njab.plotting.savefig(ax.get_figure(), files_out['umap_sel_feat.pdf'])" + " files_out[\"umap_sel_feat.pdf\"] = FOLDER / \"umap_sel_feat.pdf\"\n", + " njab.plotting.savefig(ax.get_figure(), files_out[\"umap_sel_feat.pdf\"])" ] }, { @@ -1402,11 +1423,13 @@ "if embedding is not None:\n", " embedding = embedding.join(X[results_model.selected_features])\n", " embedding_val = embedding_val.join(X_val[results_model.selected_features])\n", - " embedding['label'], embedding_val['label'] = pred_train[\n", - " 'label'], predictions['label']\n", - " embedding['group'], embedding_val['group'] = TRAIN_LABEL, TEST_LABEL\n", + " embedding[\"label\"], embedding_val[\"label\"] = (\n", + " pred_train[\"label\"],\n", + " predictions[\"label\"],\n", + " )\n", + " embedding[\"group\"], embedding_val[\"group\"] = TRAIN_LABEL, TEST_LABEL\n", " combined_embeddings = pd.concat([embedding, embedding_val])\n", - " combined_embeddings.index.name = 'ID'" + " combined_embeddings.index.name = \"ID\"" ] }, { @@ -1423,19 +1446,21 @@ "if embedding is not None:\n", " cols = combined_embeddings.columns\n", "\n", - " TEMPLATE = 'none'\n", + " TEMPLATE = \"none\"\n", " defaults = dict(width=800, height=400, template=TEMPLATE)\n", "\n", - " fig = px.scatter(combined_embeddings.round(3).reset_index(),\n", - " x=cols[0],\n", - " y=cols[1],\n", - " color='label',\n", - " facet_col='group',\n", - " hover_data=['ID'] + results_model.selected_features,\n", - " **defaults)\n", + " fig = px.scatter(\n", + " combined_embeddings.round(3).reset_index(),\n", + " x=cols[0],\n", + " y=cols[1],\n", + " color=\"label\",\n", + " facet_col=\"group\",\n", + " hover_data=[\"ID\"] + results_model.selected_features,\n", + " **defaults,\n", + " )\n", " fig.for_each_annotation(lambda a: a.update(text=a.text.split(\"=\")[1]))\n", "\n", - " fname = FOLDER / 'umap_sel_feat.html'\n", + " fname = FOLDER / \"umap_sel_feat.html\"\n", " files_out[fname.name] = fname\n", " fig.write_html(fname)\n", " print(fname)\n", @@ -1461,10 +1486,11 @@ }, "outputs": [], "source": [ - "PCs_train, pca = njab_pca.run_pca(X_scaled[results_model.selected_features],\n", - " n_components=None)\n", + "PCs_train, pca = njab_pca.run_pca(\n", + " X_scaled[results_model.selected_features], n_components=None\n", + ")\n", "ax = njab_pca.plot_explained_variance(pca)\n", - "ax.locator_params(axis='x', integer=True)\n", + "ax.locator_params(axis=\"x\", integer=True)\n", "\n", "fname = FOLDER / \"feat_sel_PCA_var_explained_by_PCs.pdf\"\n", "files_out[fname.name] = fname\n", @@ -1491,9 +1517,7 @@ "outputs": [], "source": [ "PCs_val = pca.transform(X_val_scaled[results_model.selected_features])\n", - "PCs_val = pd.DataFrame(PCs_val,\n", - " index=X_val_scaled.index,\n", - " columns=PCs_train.columns)\n", + "PCs_val = pd.DataFrame(PCs_val, index=X_val_scaled.index, columns=PCs_train.columns)\n", "PCs_val" ] }, @@ -1511,18 +1535,22 @@ "if M_sel > 1:\n", " fig, axes = plt.subplots(1, 2, figsize=(6, 3), sharex=True, sharey=True)\n", " for _embedding, ax, _title, _model_pred_label in zip(\n", - " [PCs_train, PCs_val], axes, [TRAIN_LABEL, TEST_LABEL],\n", - " [pred_train['label'], predictions['label']]): # noqa: E129\n", + " [PCs_train, PCs_val],\n", + " axes,\n", + " [TRAIN_LABEL, TEST_LABEL],\n", + " [pred_train[\"label\"], predictions[\"label\"]],\n", + " ): # noqa: E129\n", " ax = seaborn.scatterplot(\n", " x=_embedding.iloc[:, 0],\n", " y=_embedding.iloc[:, 1],\n", " hue=_model_pred_label,\n", - " hue_order=['TN', 'TP', 'FN', 'FP'],\n", + " hue_order=[\"TN\", \"TP\", \"FN\", \"FP\"],\n", " palette=[colors[0], colors[2], colors[1], colors[3]],\n", - " ax=ax)\n", + " ax=ax,\n", + " )\n", " ax.set_title(_title)\n", "\n", - " fname = FOLDER / 'pca_sel_feat.pdf'\n", + " fname = FOLDER / \"pca_sel_feat.pdf\"\n", " files_out[fname.name] = fname\n", " njab.plotting.savefig(ax.get_figure(), fname)" ] @@ -1540,29 +1568,31 @@ "source": [ "if M_sel > 1:\n", " max_rows = min(3, len(results_model.selected_features))\n", - " fig, axes = plt.subplots(max_rows,\n", - " 2,\n", - " figsize=(6, 8),\n", - " sharex=False,\n", - " sharey=False,\n", - " layout='constrained')\n", + " fig, axes = plt.subplots(\n", + " max_rows, 2, figsize=(6, 8), sharex=False, sharey=False, layout=\"constrained\"\n", + " )\n", "\n", " for axes_col, (_embedding, _title, _model_pred_label) in enumerate(\n", - " zip([PCs_train, PCs_val], [TRAIN_LABEL, TEST_LABEL],\n", - " [pred_train['label'], predictions['label']])):\n", + " zip(\n", + " [PCs_train, PCs_val],\n", + " [TRAIN_LABEL, TEST_LABEL],\n", + " [pred_train[\"label\"], predictions[\"label\"]],\n", + " )\n", + " ):\n", " _row = 0\n", " axes[_row, axes_col].set_title(_title)\n", - " for (i, j) in itertools.combinations(range(max_rows), 2):\n", + " for i, j in itertools.combinations(range(max_rows), 2):\n", " ax = seaborn.scatterplot(\n", " x=_embedding.iloc[:, i],\n", " y=_embedding.iloc[:, j],\n", " hue=_model_pred_label,\n", - " hue_order=['TN', 'TP', 'FN', 'FP'],\n", + " hue_order=[\"TN\", \"TP\", \"FN\", \"FP\"],\n", " palette=[colors[0], colors[2], colors[1], colors[3]],\n", - " ax=axes[_row, axes_col])\n", + " ax=axes[_row, axes_col],\n", + " )\n", " _row += 1\n", "\n", - " fname = FOLDER / f'pca_sel_feat_up_to_{max_rows}.pdf'\n", + " fname = FOLDER / f\"pca_sel_feat_up_to_{max_rows}.pdf\"\n", " files_out[fname.name] = fname\n", " njab.plotting.savefig(ax.get_figure(), fname)" ] @@ -1590,49 +1620,53 @@ "source": [ "if M_sel > 1:\n", " max_rows = min(3, len(results_model.selected_features))\n", - " fig, axes = plt.subplots(max_rows,\n", - " 2,\n", - " figsize=(6, 8),\n", - " sharex=False,\n", - " sharey=False,\n", - " layout='constrained')\n", + " fig, axes = plt.subplots(\n", + " max_rows, 2, figsize=(6, 8), sharex=False, sharey=False, layout=\"constrained\"\n", + " )\n", "\n", " for axes_col, (_embedding, _title, _model_pred_label) in enumerate(\n", - " zip([\n", + " zip(\n", + " [\n", " X_scaled[results_model.selected_features],\n", - " X_val_scaled[results_model.selected_features]\n", - " ], [TRAIN_LABEL, TEST_LABEL],\n", - " [pred_train['label'], predictions['label']])):\n", + " X_val_scaled[results_model.selected_features],\n", + " ],\n", + " [TRAIN_LABEL, TEST_LABEL],\n", + " [pred_train[\"label\"], predictions[\"label\"]],\n", + " )\n", + " ):\n", " _row = 0\n", " axes[_row, axes_col].set_title(_title)\n", - " for (i, j) in itertools.combinations(range(max_rows), 2):\n", + " for i, j in itertools.combinations(range(max_rows), 2):\n", " ax = seaborn.scatterplot(\n", " x=_embedding.iloc[:, i],\n", " y=_embedding.iloc[:, j],\n", " hue=_model_pred_label,\n", - " hue_order=['TN', 'TP', 'FN', 'FP'],\n", + " hue_order=[\"TN\", \"TP\", \"FN\", \"FP\"],\n", " palette=[colors[0], colors[2], colors[1], colors[3]],\n", - " ax=axes[_row, axes_col])\n", + " ax=axes[_row, axes_col],\n", + " )\n", " _row += 1\n", "\n", - " fname = FOLDER / f'sel_feat_up_to_{max_rows}.pdf'\n", + " fname = FOLDER / f\"sel_feat_up_to_{max_rows}.pdf\"\n", " files_out[fname.name] = fname\n", " njab.plotting.savefig(ax.get_figure(), fname)\n", "else:\n", - " fig, axes = plt.subplots(1, 1, figsize=(6, 2), layout='constrained')\n", + " fig, axes = plt.subplots(1, 1, figsize=(6, 2), layout=\"constrained\")\n", " single_feature = results_model.selected_features[0]\n", - " data = pd.concat([\n", - " X[single_feature].to_frame().join(\n", - " pred_train['label']).assign(group=TRAIN_LABEL),\n", - " X_val[single_feature].to_frame().join(\n", - " predictions['label']).assign(group=TEST_LABEL)\n", - " ])\n", - " ax = seaborn.swarmplot(data=data,\n", - " x='group',\n", - " y=single_feature,\n", - " hue='label',\n", - " ax=axes)\n", - " fname = FOLDER / f'sel_feat_{single_feature}.pdf'\n", + " data = pd.concat(\n", + " [\n", + " X[single_feature]\n", + " .to_frame()\n", + " .join(pred_train[\"label\"])\n", + " .assign(group=TRAIN_LABEL),\n", + " X_val[single_feature]\n", + " .to_frame()\n", + " .join(predictions[\"label\"])\n", + " .assign(group=TEST_LABEL),\n", + " ]\n", + " )\n", + " ax = seaborn.swarmplot(data=data, x=\"group\", y=single_feature, hue=\"label\", ax=axes)\n", + " fname = FOLDER / f\"sel_feat_{single_feature}.pdf\"\n", " files_out[fname.name] = fname\n", " njab.plotting.savefig(ax.get_figure(), fname)" ] @@ -1655,9 +1689,11 @@ "outputs": [], "source": [ "X[results_model.selected_features].join(pred_train).to_excel(\n", - " writer, sheet_name='pred_train_annotated', float_format=\"%.3f\")\n", + " writer, sheet_name=\"pred_train_annotated\", float_format=\"%.3f\"\n", + ")\n", "X_val[results_model.selected_features].join(predictions).to_excel(\n", - " writer, sheet_name='pred_test_annotated', float_format=\"%.3f\")" + " writer, sheet_name=\"pred_test_annotated\", float_format=\"%.3f\"\n", + ")" ] }, { diff --git a/docs/tutorial/log_reg.py b/docs/tutorial/log_reg.py index 087ffa3..00c8e81 100644 --- a/docs/tutorial/log_reg.py +++ b/docs/tutorial/log_reg.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.15.2 +# jupytext_version: 1.16.4 # kernelspec: # display_name: Python 3 (ipykernel) # language: python @@ -50,19 +50,22 @@ from njab.plotting.metrics import plot_auc, plot_prc from njab.sklearn import StandardScaler from njab.sklearn import pca as njab_pca -from njab.sklearn.scoring import (ConfusionMatrix, - get_lr_multiplicative_decomposition, - get_pred, get_score, - get_target_count_per_bin) +from njab.sklearn.scoring import ( + ConfusionMatrix, + get_lr_multiplicative_decomposition, + get_pred, + get_score, + get_target_count_per_bin, +) from njab.sklearn.types import Splits -logger = logging.getLogger('njab') +logger = logging.getLogger("njab") logger.setLevel(logging.INFO) njab.pandas.set_pandas_options() pd.options.display.min_rows = 10 pd.options.display.max_columns = 20 -njab.plotting.set_font_sizes('x-small') +njab.plotting.set_font_sizes("x-small") seaborn.set_style("whitegrid") njab.plotting.set_font_sizes(8) @@ -71,17 +74,21 @@ # ## Set parameters # %% tags=["parameters"] -CLINIC: str = 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/clinic_ml.csv' # clincial data -fname_omics: str = 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv' # omics data -TARGET: str = 'AD' # target column in CLINIC dataset (binary) +CLINIC: str = ( + "https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/clinic_ml.csv" # clincial data +) +fname_omics: str = ( + "https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv" # omics data +) +TARGET: str = "AD" # target column in CLINIC dataset (binary) TARGET_LABEL: Optional[str] = None # optional: rename target variable n_features_max: int = 5 freq_cutoff: float = 0.5 # Omics cutoff for sample completeness -VAL_IDS: str = '' # -VAL_IDS_query: str = '' +VAL_IDS: str = "" # +VAL_IDS_query: str = "" weights: bool = True -FOLDER = 'alzheimer' -model_name = 'all' +FOLDER = "alzheimer" +model_name = "all" # %% [markdown] # ## Setup @@ -111,7 +118,8 @@ M_after = omics.shape[1] msg = ( f"Removed {M_before-M_after} features with more than {freq_cutoff*100}% missing values." - f"\nRemaining features: {M_after} (of {M_before})") + f"\nRemaining features: {M_after} (of {M_before})" +) print(msg) # keep a map of all proteins in protein group, but only display first protein # proteins are unique to protein groups @@ -138,8 +146,10 @@ target_counts = clinic[TARGET].value_counts() if target_counts.sum() < len(clinic): - print("Target has missing values." - f" Can only use {target_counts.sum()} of {len(clinic)} samples.") + print( + "Target has missing values." + f" Can only use {target_counts.sum()} of {len(clinic)} samples." + ) mask = clinic[TARGET].notna() clinic, omics = clinic.loc[mask], omics.loc[mask] @@ -163,10 +173,8 @@ else: logging.warning("Create train and test split.") _, VAL_IDS = sklearn.model_selection.train_test_split( - clinic.index, - test_size=0.2, - random_state=123, - stratify=clinic[TARGET]) + clinic.index, test_size=0.2, random_state=123, stratify=clinic[TARGET] + ) VAL_IDS = list(VAL_IDS) elif isinstance(VAL_IDS, str): VAL_IDS = VAL_IDS.split(",") @@ -196,14 +204,16 @@ # Separate train and test split # %% tags=["hide-input"] -TRAIN_LABEL = 'train' -TEST_LABEL = 'test' +TRAIN_LABEL = "train" +TEST_LABEL = "test" if VAL_IDS: diff = pd.Index(VAL_IDS) VAL_IDS = X.index.intersection(VAL_IDS) if len(diff) < len(VAL_IDS): - logging.warning("Some requested validation IDs are not in the data: " - ",".join(str(x) for x in diff.difference(VAL_IDS))) + logging.warning( + "Some requested validation IDs are not in the data: " + ",".join(str(x) for x in diff.difference(VAL_IDS)) + ) X_val = X.loc[VAL_IDS] X = X.drop(VAL_IDS) @@ -227,7 +237,7 @@ # %% tags=["hide-input"] # out files_out = {} -fname = FOLDER / 'log_reg.xlsx' +fname = FOLDER / "log_reg.xlsx" files_out[fname.stem] = fname writer = pd.ExcelWriter(fname) print(f"Excel-file for tables: {fname}") @@ -236,7 +246,7 @@ # ## Collect test predictions # %% -predictions = y_val.to_frame('true') +predictions = y_val.to_frame("true") # %% [markdown] # ## Fill missing values with training median @@ -255,7 +265,7 @@ # Impute using median of training data # %% -median_imputer = sklearn.impute.SimpleImputer(strategy='median') +median_imputer = sklearn.impute.SimpleImputer(strategy="median") X = njab.sklearn.transform_DataFrame(X, median_imputer.fit_transform) X_val = njab.sklearn.transform_DataFrame(X_val, median_imputer.transform) @@ -273,7 +283,7 @@ PCs, pca = njab_pca.run_pca(X_scaled, n_components=None) files_out["var_explained_by_PCs.pdf"] = FOLDER / "var_explained_by_PCs.pdf" ax = njab_pca.plot_explained_variance(pca) -ax.locator_params(axis='x', integer=True) +ax.locator_params(axis="x", integer=True) njab.plotting.savefig(ax.get_figure(), files_out["var_explained_by_PCs.pdf"]) X_scaled.shape @@ -281,18 +291,18 @@ # Plot first 5 PCs with binary target label annotating each sample:: # %% tags=["hide-input"] -files_out['scatter_first_5PCs.pdf'] = FOLDER / 'scatter_first_5PCs.pdf' +files_out["scatter_first_5PCs.pdf"] = FOLDER / "scatter_first_5PCs.pdf" -fig, axes = plt.subplots(5, 2, figsize=(6, 8), layout='constrained') +fig, axes = plt.subplots(5, 2, figsize=(6, 8), layout="constrained") PCs.columns = [s.replace("principal component", "PC") for s in PCs.columns] -PCs = PCs.join(y.astype('category')) +PCs = PCs.join(y.astype("category")) up_to = min(PCs.shape[-1], 5) # https://github.com/matplotlib/matplotlib/issues/25538 # colab: old pandas version and too new matplotlib version (2023-11-6) for (i, j), ax in zip(itertools.combinations(range(up_to), 2), axes.flatten()): - PCs.plot.scatter(i, j, c=TARGET_LABEL, cmap='Paired', ax=ax) + PCs.plot.scatter(i, j, c=TARGET_LABEL, cmap="Paired", ax=ax) _ = PCs.pop(TARGET_LABEL) -njab.plotting.savefig(fig, files_out['scatter_first_5PCs.pdf']) +njab.plotting.savefig(fig, files_out["scatter_first_5PCs.pdf"]) # %% [markdown] # ## UMAP @@ -302,21 +312,20 @@ reducer = umap.UMAP() embedding = reducer.fit_transform(X_scaled) -files_out['umap.pdf'] = FOLDER / 'umap.pdf' +files_out["umap.pdf"] = FOLDER / "umap.pdf" -embedding = pd.DataFrame(embedding, - index=X_scaled.index, - columns=['UMAP 1', - 'UMAP 2']).join(y.astype('category')) -ax = embedding.plot.scatter('UMAP 1', 'UMAP 2', c=TARGET_LABEL, cmap='Paired') -njab.plotting.savefig(ax.get_figure(), files_out['umap.pdf']) +embedding = pd.DataFrame( + embedding, index=X_scaled.index, columns=["UMAP 1", "UMAP 2"] +).join(y.astype("category")) +ax = embedding.plot.scatter("UMAP 1", "UMAP 2", c=TARGET_LABEL, cmap="Paired") +njab.plotting.savefig(ax.get_figure(), files_out["umap.pdf"]) # %% [markdown] # ## Baseline Model - Logistic Regression # Based on parameters, use weighting: # %% if weights: - weights = 'balanced' + weights = "balanced" cutoff = 0.5 else: cutoff = None @@ -331,23 +340,23 @@ # Define splits and models: # %% -splits = Splits(X_train=X_scaled, - X_test=scaler.transform(X_val), - y_train=y, - y_test=y_val) -model = sklearn.linear_model.LogisticRegression(penalty='l2', - class_weight=weights) +splits = Splits( + X_train=X_scaled, X_test=scaler.transform(X_val), y_train=y, y_test=y_val +) +model = sklearn.linear_model.LogisticRegression(penalty="l2", class_weight=weights) # %% tags=["hide-input"] scoring = [ - 'precision', 'recall', 'f1', 'balanced_accuracy', 'roc_auc', - 'average_precision' + "precision", + "recall", + "f1", + "balanced_accuracy", + "roc_auc", + "average_precision", ] scoring = {k: k for k in scoring} # do not average log loss for AIC and BIC calculations -scoring['log_loss'] = make_scorer(log_loss, - greater_is_better=True, - normalize=False) +scoring["log_loss"] = make_scorer(log_loss, greater_is_better=True, normalize=False) cv_feat = njab.sklearn.find_n_best_features( X=splits.X_train, y=splits.y_train, @@ -359,8 +368,7 @@ return_train_score=True, # fit_params=dict(sample_weight=weights) ) -cv_feat = cv_feat.drop('test_case', - axis=1).groupby('n_features').agg(['mean', 'std']) +cv_feat = cv_feat.drop("test_case", axis=1).groupby("n_features").agg(["mean", "std"]) cv_feat # %% [markdown] @@ -370,19 +378,19 @@ # AIC vs BIC on train and test data with bigger is better IC_criteria = pd.DataFrame() N_split = { - 'train': round(len(splits.X_train) * 0.8), - 'test': round(len(splits.X_train) * 0.2) + "train": round(len(splits.X_train) * 0.8), + "test": round(len(splits.X_train) * 0.2), } -for _split in ('train', 'test'): +for _split in ("train", "test"): - IC_criteria[(f'{_split}_neg_AIC', - 'mean')] = -(2 * cv_feat.index.to_series() - - 2 * cv_feat[(f'{_split}_log_loss', 'mean')]) - IC_criteria[( - f'{_split}_neg_BIC', - 'mean')] = -(cv_feat.index.to_series() * np.log(N_split[_split]) - - 2 * cv_feat[(f'{_split}_log_loss', 'mean')]) + IC_criteria[(f"{_split}_neg_AIC", "mean")] = -( + 2 * cv_feat.index.to_series() - 2 * cv_feat[(f"{_split}_log_loss", "mean")] + ) + IC_criteria[(f"{_split}_neg_BIC", "mean")] = -( + cv_feat.index.to_series() * np.log(N_split[_split]) + - 2 * cv_feat[(f"{_split}_log_loss", "mean")] + ) IC_criteria.columns = pd.MultiIndex.from_tuples(IC_criteria.columns) IC_criteria @@ -392,25 +400,26 @@ # %% cv_feat = cv_feat.join(IC_criteria) cv_feat = cv_feat.filter(regex="train|test", axis=1).style.highlight_max( - axis=0, subset=pd.IndexSlice[:, pd.IndexSlice[:, 'mean']]) + axis=0, subset=pd.IndexSlice[:, pd.IndexSlice[:, "mean"]] +) cv_feat # %% [markdown] # Save: # %% -cv_feat.to_excel(writer, 'CV', float_format='%.3f') +cv_feat.to_excel(writer, sheet_name="CV", float_format="%.3f") cv_feat = cv_feat.data # %% [markdown] # Optimal number of features to use based on cross-validation by metric: # %% tags=["hide-input"] -mask = cv_feat.columns.levels[0].str[:4] == 'test' +mask = cv_feat.columns.levels[0].str[:4] == "test" scores_cols = cv_feat.columns.levels[0][mask] -n_feat_best = cv_feat.loc[:, pd.IndexSlice[scores_cols, 'mean']].idxmax() -n_feat_best.name = 'best' -n_feat_best.to_excel(writer, 'n_feat_best') +n_feat_best = cv_feat.loc[:, pd.IndexSlice[scores_cols, "mean"]].idxmax() +n_feat_best.name = "best" +n_feat_best.to_excel(writer, sheet_name="n_feat_best") n_feat_best # %% [markdown] @@ -420,7 +429,7 @@ results_model = njab.sklearn.run_model( model=model, splits=splits, - n_feat_to_select=n_feat_best.loc['test_roc_auc', 'mean'], + n_feat_to_select=n_feat_best.loc["test_roc_auc", "mean"], ) results_model.name = model_name @@ -428,32 +437,32 @@ # ## Receiver Operating Curve of final model # %% tags=["hide-input"] -ax = plot_auc(results_model, - label_train=TRAIN_LABEL, - label_test=TEST_LABEL, - figsize=(4, 2)) -files_out['ROAUC'] = FOLDER / 'plot_roauc.pdf' -njab.plotting.savefig(ax.get_figure(), files_out['ROAUC']) +ax = plot_auc( + results_model, label_train=TRAIN_LABEL, label_test=TEST_LABEL, figsize=(4, 2) +) +files_out["ROAUC"] = FOLDER / "plot_roauc.pdf" +njab.plotting.savefig(ax.get_figure(), files_out["ROAUC"]) # %% [markdown] # ## Precision-Recall Curve for final model # %% tags=["hide-input"] -ax = plot_prc(results_model, - label_train=TRAIN_LABEL, - label_test=TEST_LABEL, - figsize=(4, 2)) -files_out['PRAUC'] = FOLDER / 'plot_prauc.pdf' -njab.plotting.savefig(ax.get_figure(), files_out['PRAUC']) +ax = plot_prc( + results_model, label_train=TRAIN_LABEL, label_test=TEST_LABEL, figsize=(4, 2) +) +files_out["PRAUC"] = FOLDER / "plot_prauc.pdf" +njab.plotting.savefig(ax.get_figure(), files_out["PRAUC"]) # %% [markdown] # ## Coefficients with/out std. errors # %% tags=["hide-input"] -pd.DataFrame({ - 'coef': results_model.model.coef_.flatten(), - 'name': results_model.model.feature_names_in_ -}) +pd.DataFrame( + { + "coef": results_model.model.coef_.flatten(), + "name": results_model.model.feature_names_in_, + } +) # %% results_model.model.intercept_ @@ -463,7 +472,7 @@ # %% tags=["hide-input"] des_selected_feat = splits.X_train[results_model.selected_features].describe() -des_selected_feat.to_excel(writer, 'sel_feat', float_format='%.3f') +des_selected_feat.to_excel(writer, sheet_name="sel_feat", float_format="%.3f") des_selected_feat # %% [markdown] @@ -471,47 +480,45 @@ # %% tags=["hide-input"] fig = plt.figure(figsize=(6, 6)) -files_out['corr_plot_train.pdf'] = FOLDER / 'corr_plot_train.pdf' +files_out["corr_plot_train.pdf"] = FOLDER / "corr_plot_train.pdf" _ = corrplot(X[results_model.selected_features].join(y).corr(), size_scale=300) -njab.plotting.savefig(fig, files_out['corr_plot_train.pdf']) +njab.plotting.savefig(fig, files_out["corr_plot_train.pdf"]) # %% [markdown] # ## Plot training data scores # %% tags=["hide-input"] N_BINS = 20 -score = get_score(clf=results_model.model, - X=splits.X_train[results_model.selected_features], - pos=1) +score = get_score( + clf=results_model.model, X=splits.X_train[results_model.selected_features], pos=1 +) ax = score.hist(bins=N_BINS) -files_out['hist_score_train.pdf'] = FOLDER / 'hist_score_train.pdf' -njab.plotting.savefig(ax.get_figure(), files_out['hist_score_train.pdf']) +files_out["hist_score_train.pdf"] = FOLDER / "hist_score_train.pdf" +njab.plotting.savefig(ax.get_figure(), files_out["hist_score_train.pdf"]) pred_bins = get_target_count_per_bin(score, y, n_bins=N_BINS) -ax = pred_bins.plot(kind='bar', ylabel='count') -files_out[ - 'hist_score_train_target.pdf'] = FOLDER / 'hist_score_train_target.pdf' -njab.plotting.savefig(ax.get_figure(), - files_out['hist_score_train_target.pdf']) +ax = pred_bins.plot(kind="bar", ylabel="count") +files_out["hist_score_train_target.pdf"] = FOLDER / "hist_score_train_target.pdf" +njab.plotting.savefig(ax.get_figure(), files_out["hist_score_train_target.pdf"]) # pred_bins # %% [markdown] # ## Test data scores # %% tags=["hide-input"] -score_val = get_score(clf=results_model.model, - X=splits.X_test[results_model.selected_features], - pos=1) -predictions['score'] = score_val +score_val = get_score( + clf=results_model.model, X=splits.X_test[results_model.selected_features], pos=1 +) +predictions["score"] = score_val ax = score_val.hist(bins=N_BINS) # list(x/N_BINS for x in range(0,N_BINS))) -ax.set_ylabel('count') +ax.set_ylabel("count") ax.set_xlim(0, 1) -files_out['hist_score_test.pdf'] = FOLDER / 'hist_score_test.pdf' -njab.plotting.savefig(ax.get_figure(), files_out['hist_score_test.pdf']) +files_out["hist_score_test.pdf"] = FOLDER / "hist_score_test.pdf" +njab.plotting.savefig(ax.get_figure(), files_out["hist_score_test.pdf"]) pred_bins_val = get_target_count_per_bin(score_val, y_val, n_bins=N_BINS) -ax = pred_bins_val.plot(kind='bar', ylabel='count') -ax.locator_params(axis='y', integer=True) -files_out['hist_score_test_target.pdf'] = FOLDER / 'hist_score_test_target.pdf' -njab.plotting.savefig(ax.get_figure(), files_out['hist_score_test_target.pdf']) +ax = pred_bins_val.plot(kind="bar", ylabel="count") +ax.locator_params(axis="y", integer=True) +files_out["hist_score_test_target.pdf"] = FOLDER / "hist_score_test_target.pdf" +njab.plotting.savefig(ax.get_figure(), files_out["hist_score_test_target.pdf"]) # pred_bins_val # %% [markdown] @@ -520,45 +527,49 @@ # between precision and recall: # %% tags=["hide-input"] -prc = pd.DataFrame(results_model.train.prc, - index='precision recall cutoffs'.split()) +prc = pd.DataFrame(results_model.train.prc, index="precision recall cutoffs".split()) prc # %% tags=["hide-input"] -prc.loc['f1_score'] = 2 * (prc.loc['precision'] * prc.loc['recall']) / ( - 1 / prc.loc['precision'] + 1 / prc.loc['recall']) -f1_max = prc[prc.loc['f1_score'].argmax()] +prc.loc["f1_score"] = ( + 2 + * (prc.loc["precision"] * prc.loc["recall"]) + / (1 / prc.loc["precision"] + 1 / prc.loc["recall"]) +) +f1_max = prc[prc.loc["f1_score"].argmax()] f1_max # %% [markdown] # Cutoff set # %% tags=["hide-input"] -cutoff = float(f1_max.loc['cutoffs']) +cutoff = float(f1_max.loc["cutoffs"]) cutoff # %% tags=["hide-input"] y_pred_val = njab.sklearn.scoring.get_custom_pred( clf=results_model.model, X=splits.X_test[results_model.selected_features], - cutoff=cutoff) + cutoff=cutoff, +) predictions[model_name] = y_pred_val -predictions['dead'] = y_val +predictions["dead"] = y_val _ = ConfusionMatrix(y_val, y_pred_val).as_dataframe() -_.columns = pd.MultiIndex.from_tuples([(t[0] + f" - {cutoff:.3f}", t[1]) - for t in _.columns]) -_.to_excel(writer, "CM_test_cutoff_adapted") +_.columns = pd.MultiIndex.from_tuples( + [(t[0] + f" - {cutoff:.3f}", t[1]) for t in _.columns] +) +_.to_excel(writer, sheet_name="CM_test_cutoff_adapted") _ # %% tags=["hide-input"] -y_pred_val = get_pred(clf=results_model.model, - X=splits.X_test[results_model.selected_features]) +y_pred_val = get_pred( + clf=results_model.model, X=splits.X_test[results_model.selected_features] +) predictions[model_name] = y_pred_val -predictions['dead'] = y_val +predictions["dead"] = y_val _ = ConfusionMatrix(y_val, y_pred_val).as_dataframe() -_.columns = pd.MultiIndex.from_tuples([(t[0] + f" - {0.5}", t[1]) - for t in _.columns]) -_.to_excel(writer, "CM_test_cutoff_0.5") +_.columns = pd.MultiIndex.from_tuples([(t[0] + f" - {0.5}", t[1]) for t in _.columns]) +_.to_excel(writer, sheet_name="CM_test_cutoff_0.5") _ # %% [markdown] @@ -566,25 +577,23 @@ # Decompose the model into its components for both splits: # %% tags=["hide-input"] -components = get_lr_multiplicative_decomposition(results=results_model, - X=splits.X_train, - prob=score, - y=y) -components.to_excel(writer, 'decomp_multiplicative_train') -components.to_excel(writer, - 'decomp_multiplicative_train_view', - float_format='%.5f') +components = get_lr_multiplicative_decomposition( + results=results_model, X=splits.X_train, prob=score, y=y +) +components.to_excel(writer, sheet_name="decomp_multiplicative_train") +components.to_excel( + writer, sheet_name="decomp_multiplicative_train_view", float_format="%.5f" +) components.head(10) # %% tags=["hide-input"] -components_test = get_lr_multiplicative_decomposition(results=results_model, - X=splits.X_test, - prob=score_val, - y=y_val) -components_test.to_excel(writer, 'decomp_multiplicative_test') -components_test.to_excel(writer, - 'decomp_multiplicative_test_view', - float_format='%.5f') +components_test = get_lr_multiplicative_decomposition( + results=results_model, X=splits.X_test, prob=score_val, y=y_val +) +components_test.to_excel(writer, sheet_name="decomp_multiplicative_test") +components_test.to_excel( + writer, sheet_name="decomp_multiplicative_test_view", float_format="%.5f" +) components_test.head(10) @@ -598,13 +607,13 @@ # make sure to have two or more features? M_sel = len(results_model.selected_features) if M_sel > 1: - embedding = reducer.fit_transform( - X_scaled[results_model.selected_features]) + embedding = reducer.fit_transform(X_scaled[results_model.selected_features]) - embedding = pd.DataFrame(embedding, - index=X_scaled.index, - columns=['UMAP dimension 1', 'UMAP dimension 2' - ]).join(y.astype('category')) + embedding = pd.DataFrame( + embedding, + index=X_scaled.index, + columns=["UMAP dimension 1", "UMAP dimension 2"], + ).join(y.astype("category")) display(embedding.head(3)) else: embedding = None @@ -613,12 +622,14 @@ # Annotate using target variable and predictions: # %% tags=["hide-input"] -predictions['label'] = predictions.apply( +predictions["label"] = predictions.apply( lambda x: njab.sklearn.scoring.get_label_binary_classification( - x['true'], x[model_name]), - axis=1) -mask = predictions[['true', model_name]].sum(axis=1).astype(bool) -predictions.loc[mask].sort_values('score', ascending=False) + x["true"], x[model_name] + ), + axis=1, +) +mask = predictions[["true", model_name]].sum(axis=1).astype(bool) +predictions.loc[mask].sort_values("score", ascending=False) # %% tags=["hide-input"] X_val_scaled = scaler.transform(X_val) @@ -626,20 +637,26 @@ embedding_val = pd.DataFrame( reducer.transform(X_val_scaled[results_model.selected_features]), index=X_val_scaled.index, - columns=['UMAP dimension 1', 'UMAP dimension 2']) + columns=["UMAP dimension 1", "UMAP dimension 2"], + ) embedding_val.sample(3) # %% tags=["hide-input"] pred_train = ( - y.to_frame('true') + y.to_frame("true") # .join(get_score(clf=results_model.model, X=splits.X_train[results_model.selected_features], pos=1)) - .join(score.rename('score')).join( - get_pred(results_model.model, splits.X_train[ - results_model.selected_features]).rename(model_name))) -pred_train['label'] = pred_train.apply( + .join(score.rename("score")).join( + get_pred( + results_model.model, splits.X_train[results_model.selected_features] + ).rename(model_name) + ) +) +pred_train["label"] = pred_train.apply( lambda x: njab.sklearn.scoring.get_label_binary_classification( - x['true'], x[model_name]), - axis=1) + x["true"], x[model_name] + ), + axis=1, +) pred_train.sample(5) # %% tags=["hide-cell"] @@ -650,22 +667,26 @@ if embedding is not None: fig, axes = plt.subplots(1, 2, figsize=(8, 4), sharex=True, sharey=True) for _embedding, ax, _title, _model_pred_label in zip( - [embedding, embedding_val], axes, [TRAIN_LABEL, TEST_LABEL], - [pred_train['label'], predictions['label']]): # noqa: E129 + [embedding, embedding_val], + axes, + [TRAIN_LABEL, TEST_LABEL], + [pred_train["label"], predictions["label"]], + ): # noqa: E129 ax = seaborn.scatterplot( x=_embedding.iloc[:, 0], y=_embedding.iloc[:, 1], hue=_model_pred_label, - hue_order=['TN', 'TP', 'FN', 'FP'], + hue_order=["TN", "TP", "FN", "FP"], palette=[colors[0], colors[2], colors[1], colors[3]], - ax=ax) + ax=ax, + ) ax.set_title(_title) # files_out['pred_pca_labeled'] = FOLDER / 'pred_pca_labeled.pdf' # njab.plotting.savefig(fig, files_out['pred_pca_labeled']) - files_out['umap_sel_feat.pdf'] = FOLDER / 'umap_sel_feat.pdf' - njab.plotting.savefig(ax.get_figure(), files_out['umap_sel_feat.pdf']) + files_out["umap_sel_feat.pdf"] = FOLDER / "umap_sel_feat.pdf" + njab.plotting.savefig(ax.get_figure(), files_out["umap_sel_feat.pdf"]) # %% [markdown] # ### Interactive UMAP plot @@ -675,29 +696,33 @@ if embedding is not None: embedding = embedding.join(X[results_model.selected_features]) embedding_val = embedding_val.join(X_val[results_model.selected_features]) - embedding['label'], embedding_val['label'] = pred_train[ - 'label'], predictions['label'] - embedding['group'], embedding_val['group'] = TRAIN_LABEL, TEST_LABEL + embedding["label"], embedding_val["label"] = ( + pred_train["label"], + predictions["label"], + ) + embedding["group"], embedding_val["group"] = TRAIN_LABEL, TEST_LABEL combined_embeddings = pd.concat([embedding, embedding_val]) - combined_embeddings.index.name = 'ID' + combined_embeddings.index.name = "ID" # %% tags=["hide-input"] if embedding is not None: cols = combined_embeddings.columns - TEMPLATE = 'none' + TEMPLATE = "none" defaults = dict(width=800, height=400, template=TEMPLATE) - fig = px.scatter(combined_embeddings.round(3).reset_index(), - x=cols[0], - y=cols[1], - color='label', - facet_col='group', - hover_data=['ID'] + results_model.selected_features, - **defaults) + fig = px.scatter( + combined_embeddings.round(3).reset_index(), + x=cols[0], + y=cols[1], + color="label", + facet_col="group", + hover_data=["ID"] + results_model.selected_features, + **defaults, + ) fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1])) - fname = FOLDER / 'umap_sel_feat.html' + fname = FOLDER / "umap_sel_feat.html" files_out[fname.name] = fname fig.write_html(fname) print(fname) @@ -707,10 +732,11 @@ # ### PCA # %% tags=["hide-input"] -PCs_train, pca = njab_pca.run_pca(X_scaled[results_model.selected_features], - n_components=None) +PCs_train, pca = njab_pca.run_pca( + X_scaled[results_model.selected_features], n_components=None +) ax = njab_pca.plot_explained_variance(pca) -ax.locator_params(axis='x', integer=True) +ax.locator_params(axis="x", integer=True) fname = FOLDER / "feat_sel_PCA_var_explained_by_PCs.pdf" files_out[fname.name] = fname @@ -721,56 +747,60 @@ # %% tags=["hide-input"] PCs_val = pca.transform(X_val_scaled[results_model.selected_features]) -PCs_val = pd.DataFrame(PCs_val, - index=X_val_scaled.index, - columns=PCs_train.columns) +PCs_val = pd.DataFrame(PCs_val, index=X_val_scaled.index, columns=PCs_train.columns) PCs_val # %% tags=["hide-input"] if M_sel > 1: fig, axes = plt.subplots(1, 2, figsize=(6, 3), sharex=True, sharey=True) for _embedding, ax, _title, _model_pred_label in zip( - [PCs_train, PCs_val], axes, [TRAIN_LABEL, TEST_LABEL], - [pred_train['label'], predictions['label']]): # noqa: E129 + [PCs_train, PCs_val], + axes, + [TRAIN_LABEL, TEST_LABEL], + [pred_train["label"], predictions["label"]], + ): # noqa: E129 ax = seaborn.scatterplot( x=_embedding.iloc[:, 0], y=_embedding.iloc[:, 1], hue=_model_pred_label, - hue_order=['TN', 'TP', 'FN', 'FP'], + hue_order=["TN", "TP", "FN", "FP"], palette=[colors[0], colors[2], colors[1], colors[3]], - ax=ax) + ax=ax, + ) ax.set_title(_title) - fname = FOLDER / 'pca_sel_feat.pdf' + fname = FOLDER / "pca_sel_feat.pdf" files_out[fname.name] = fname njab.plotting.savefig(ax.get_figure(), fname) # %% tags=["hide-input"] if M_sel > 1: max_rows = min(3, len(results_model.selected_features)) - fig, axes = plt.subplots(max_rows, - 2, - figsize=(6, 8), - sharex=False, - sharey=False, - layout='constrained') + fig, axes = plt.subplots( + max_rows, 2, figsize=(6, 8), sharex=False, sharey=False, layout="constrained" + ) for axes_col, (_embedding, _title, _model_pred_label) in enumerate( - zip([PCs_train, PCs_val], [TRAIN_LABEL, TEST_LABEL], - [pred_train['label'], predictions['label']])): + zip( + [PCs_train, PCs_val], + [TRAIN_LABEL, TEST_LABEL], + [pred_train["label"], predictions["label"]], + ) + ): _row = 0 axes[_row, axes_col].set_title(_title) - for (i, j) in itertools.combinations(range(max_rows), 2): + for i, j in itertools.combinations(range(max_rows), 2): ax = seaborn.scatterplot( x=_embedding.iloc[:, i], y=_embedding.iloc[:, j], hue=_model_pred_label, - hue_order=['TN', 'TP', 'FN', 'FP'], + hue_order=["TN", "TP", "FN", "FP"], palette=[colors[0], colors[2], colors[1], colors[3]], - ax=axes[_row, axes_col]) + ax=axes[_row, axes_col], + ) _row += 1 - fname = FOLDER / f'pca_sel_feat_up_to_{max_rows}.pdf' + fname = FOLDER / f"pca_sel_feat_up_to_{max_rows}.pdf" files_out[fname.name] = fname njab.plotting.savefig(ax.get_figure(), fname) @@ -782,49 +812,53 @@ # %% tags=["hide-input"] if M_sel > 1: max_rows = min(3, len(results_model.selected_features)) - fig, axes = plt.subplots(max_rows, - 2, - figsize=(6, 8), - sharex=False, - sharey=False, - layout='constrained') + fig, axes = plt.subplots( + max_rows, 2, figsize=(6, 8), sharex=False, sharey=False, layout="constrained" + ) for axes_col, (_embedding, _title, _model_pred_label) in enumerate( - zip([ + zip( + [ X_scaled[results_model.selected_features], - X_val_scaled[results_model.selected_features] - ], [TRAIN_LABEL, TEST_LABEL], - [pred_train['label'], predictions['label']])): + X_val_scaled[results_model.selected_features], + ], + [TRAIN_LABEL, TEST_LABEL], + [pred_train["label"], predictions["label"]], + ) + ): _row = 0 axes[_row, axes_col].set_title(_title) - for (i, j) in itertools.combinations(range(max_rows), 2): + for i, j in itertools.combinations(range(max_rows), 2): ax = seaborn.scatterplot( x=_embedding.iloc[:, i], y=_embedding.iloc[:, j], hue=_model_pred_label, - hue_order=['TN', 'TP', 'FN', 'FP'], + hue_order=["TN", "TP", "FN", "FP"], palette=[colors[0], colors[2], colors[1], colors[3]], - ax=axes[_row, axes_col]) + ax=axes[_row, axes_col], + ) _row += 1 - fname = FOLDER / f'sel_feat_up_to_{max_rows}.pdf' + fname = FOLDER / f"sel_feat_up_to_{max_rows}.pdf" files_out[fname.name] = fname njab.plotting.savefig(ax.get_figure(), fname) else: - fig, axes = plt.subplots(1, 1, figsize=(6, 2), layout='constrained') + fig, axes = plt.subplots(1, 1, figsize=(6, 2), layout="constrained") single_feature = results_model.selected_features[0] - data = pd.concat([ - X[single_feature].to_frame().join( - pred_train['label']).assign(group=TRAIN_LABEL), - X_val[single_feature].to_frame().join( - predictions['label']).assign(group=TEST_LABEL) - ]) - ax = seaborn.swarmplot(data=data, - x='group', - y=single_feature, - hue='label', - ax=axes) - fname = FOLDER / f'sel_feat_{single_feature}.pdf' + data = pd.concat( + [ + X[single_feature] + .to_frame() + .join(pred_train["label"]) + .assign(group=TRAIN_LABEL), + X_val[single_feature] + .to_frame() + .join(predictions["label"]) + .assign(group=TEST_LABEL), + ] + ) + ax = seaborn.swarmplot(data=data, x="group", y=single_feature, hue="label", ax=axes) + fname = FOLDER / f"sel_feat_{single_feature}.pdf" files_out[fname.name] = fname njab.plotting.savefig(ax.get_figure(), fname) @@ -835,9 +869,11 @@ # %% X[results_model.selected_features].join(pred_train).to_excel( - writer, sheet_name='pred_train_annotated', float_format="%.3f") + writer, sheet_name="pred_train_annotated", float_format="%.3f" +) X_val[results_model.selected_features].join(predictions).to_excel( - writer, sheet_name='pred_test_annotated', float_format="%.3f") + writer, sheet_name="pred_test_annotated", float_format="%.3f" +) # %% [markdown] # ## Outputs