diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 46de2be..55c1ebb 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,7 +15,8 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] + + python-version: ['3.9', '3.10', "3.11", "3.12"] os: [ubuntu-latest, windows-latest, macos-12] steps: @@ -29,10 +30,9 @@ jobs: python -m pip install --upgrade pip pip install -r test_requirements.txt pip install -r requirements.txt - pip install pympler tdigest - name: Install package run: | - pip install -e .[distributed,test] + pip install -e .[distributed,test,ecos] - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/README.rst b/README.rst index 203a86c..e490cbf 100644 --- a/README.rst +++ b/README.rst @@ -58,6 +58,12 @@ To include batch and stream binning algorithms (this option is not required for pip install optbinning[distributed] +To include support for the `ecos <https://github.com/embotech/ecos>`_ solver: + +.. code-block:: text + + pip install optbinning[ecos] + To install from source, download or clone the git repository ..
code-block:: text diff --git a/doc/source/conf.py b/doc/source/conf.py index e1da0ba..2ed5a43 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -22,9 +22,9 @@ author = 'Guillermo Navas-Palencia' # The short X.Y version -version = '0.19.0' +version = '0.20.0' # The full version, including alpha/beta/rc tags -release = '0.19.0' +release = '0.20.0' # -- General configuration --------------------------------------------------- diff --git a/optbinning/_version.py b/optbinning/_version.py index 9b45234..97733e6 100644 --- a/optbinning/_version.py +++ b/optbinning/_version.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "0.19.0" +__version__ = "0.20.0" diff --git a/optbinning/binning/binning_statistics.py b/optbinning/binning/binning_statistics.py index d12fd19..5867b6b 100644 --- a/optbinning/binning/binning_statistics.py +++ b/optbinning/binning/binning_statistics.py @@ -608,7 +608,8 @@ def build(self, show_digits=2, add_totals=True): return df def plot(self, metric="woe", add_special=True, add_missing=True, - style="bin", show_bin_labels=False, savefig=None, figsize=None): + style="bin", show_bin_labels=False, savefig=None, figsize=None, + save_kwargs=None): """Plot the binning table. Visualize the non-event and event count, and the Weight of Evidence or @@ -642,6 +643,9 @@ def plot(self, metric="woe", add_special=True, add_missing=True, figsize : tuple or None (default=None) Size of the plot. + + save_kwargs : dict or None (default=None) + Additional keyword arguments to be passed to `plt.savefig`. """ _check_is_built(self) @@ -863,7 +867,13 @@ def plot(self, metric="woe", add_special=True, add_missing=True, if not isinstance(savefig, str): raise TypeError("savefig must be a string path; got {}." .format(savefig)) - plt.savefig(savefig) + if save_kwargs is None: + save_kwargs = {} + else: + if not isinstance(save_kwargs, dict): + raise TypeError("save_kwargs must be a dictionary; got {}." 
+ .format(save_kwargs)) + plt.savefig(savefig, **save_kwargs) plt.close() def analysis(self, pvalue_test="chi2", n_samples=100, print_output=True): diff --git a/optbinning/binning/distributed/binning_sketch.py b/optbinning/binning/distributed/binning_sketch.py index ea7b8c5..06243d5 100644 --- a/optbinning/binning/distributed/binning_sketch.py +++ b/optbinning/binning/distributed/binning_sketch.py @@ -956,7 +956,7 @@ def _update_streaming_stats(self): self._solve_stats[self._n_solve] = { "n_add": self._n_add, "n_records": self._bsketch.n, - "divergence".format(self.divergence): dv + "divergence": dv } @property diff --git a/optbinning/binning/multidimensional/binning_statistics_2d.py b/optbinning/binning/multidimensional/binning_statistics_2d.py index 37ae0b8..5225575 100644 --- a/optbinning/binning/multidimensional/binning_statistics_2d.py +++ b/optbinning/binning/multidimensional/binning_statistics_2d.py @@ -338,7 +338,7 @@ def build(self, show_digits=2, show_bin_xy=False, add_totals=True): return df - def plot(self, metric="woe", savefig=None): + def plot(self, metric="woe", savefig=None, save_kwargs=None): """Plot the binning table. Visualize the Weight of Evidence or the event rate for each bin as a @@ -352,6 +352,9 @@ def plot(self, metric="woe", savefig=None): savefig : str or None (default=None) Path to save the plot figure. + + save_kwargs : dict or None (default=None) + Additional keyword arguments to be passed to `plt.savefig`. 
""" _check_is_built(self) @@ -384,7 +387,7 @@ def plot(self, metric="woe", savefig=None): er = er + [er[-1]] axtop.step(np.arange(self.n + 1) - 0.5, er, - label=path, where="post") + label=str(path), where="post") for i in range(self.n): axtop.axvline(i + 0.5, color="grey", linestyle="--", alpha=0.5) @@ -414,7 +417,7 @@ def plot(self, metric="woe", savefig=None): self.P == p, axis=0).max()) for p in path], []) er = er + [er[-1]] - axright.step(er, np.arange(self.m + 1) - 0.5, label=path, + axright.step(er, np.arange(self.m + 1) - 0.5, label=str(path), where="pre") for j in range(self.m): @@ -437,7 +440,13 @@ def plot(self, metric="woe", savefig=None): if not isinstance(savefig, str): raise TypeError("savefig must be a string path; got {}." .format(savefig)) - plt.savefig(savefig) + if save_kwargs is None: + save_kwargs = {} + else: + if not isinstance(save_kwargs, dict): + raise TypeError("save_kwargs must be a dictionary; got {}." + .format(save_kwargs)) + plt.savefig(savefig, **save_kwargs) plt.close() def analysis(self, pvalue_test="chi2", n_samples=100, print_output=True): @@ -763,7 +772,7 @@ def plot(self, savefig=None): er = er + [er[-1]] axtop.step(np.arange(self.n + 1) - 0.5, er, - label=path, where="post") + label=str(path), where="post") for i in range(self.n): axtop.axvline(i + 0.5, color="grey", linestyle="--", alpha=0.5) @@ -793,7 +802,7 @@ def plot(self, savefig=None): self.P == p, axis=0).max()) for p in path], []) er = er + [er[-1]] - axright.step(er, np.arange(self.m + 1) - 0.5, label=path, + axright.step(er, np.arange(self.m + 1) - 0.5, label=str(path), where="pre") for j in range(self.m): diff --git a/optbinning/binning/piecewise/binning_statistics.py b/optbinning/binning/piecewise/binning_statistics.py index e05109d..3092256 100644 --- a/optbinning/binning/piecewise/binning_statistics.py +++ b/optbinning/binning/piecewise/binning_statistics.py @@ -177,7 +177,8 @@ def build(self, show_digits=2, add_totals=True): return df - def plot(self, 
metric="woe", n_samples=10000, savefig=None): + def plot(self, metric="woe", n_samples=10000, savefig=None, + save_kwargs=None): """Plot the binning table. Visualize the non-event and event count, and the predicted Weight of @@ -194,6 +195,9 @@ def plot(self, metric="woe", n_samples=10000, savefig=None): savefig : str or None (default=None) Path to save the plot figure. + + save_kwargs : dict or None (default=None) + Additional keyword arguments to be passed to `plt.savefig`. """ _check_is_built(self) @@ -258,7 +262,13 @@ def plot(self, metric="woe", n_samples=10000, savefig=None): if not isinstance(savefig, str): raise TypeError("savefig must be a string path; got {}." .format(savefig)) - plt.savefig(savefig) + if save_kwargs is None: + save_kwargs = {} + else: + if not isinstance(save_kwargs, dict): + raise TypeError("save_kwargs must be a dictionary; got {}." + .format(save_kwargs)) + plt.savefig(savefig, **save_kwargs) plt.close() def analysis(self, pvalue_test="chi2", n_samples=100, print_output=True): diff --git a/optbinning/binning/preprocessing.py b/optbinning/binning/preprocessing.py index 5421b97..fc299dc 100644 --- a/optbinning/binning/preprocessing.py +++ b/optbinning/binning/preprocessing.py @@ -31,7 +31,7 @@ def categorical_transform(x, y): def categorical_cutoff(x, y, cutoff=0.01): cutoff_count = np.ceil(cutoff * len(x)) - cat_count = pd.value_counts(x) + cat_count = pd.Series(x).value_counts() cat_others = cat_count[cat_count < cutoff_count].index.values mask_others = pd.Series(x).isin(cat_others).values diff --git a/optbinning/scorecard/counterfactual/problem_data.py b/optbinning/scorecard/counterfactual/problem_data.py index 426f939..2b0131b 100644 --- a/optbinning/scorecard/counterfactual/problem_data.py +++ b/optbinning/scorecard/counterfactual/problem_data.py @@ -21,7 +21,13 @@ def problem_data(scorecard, X): sc["Points"] = sc["Mean"] * sc["Coefficient"] # Linear model coefficients - intercept = float(scorecard.estimator_.intercept_) + + # Only 
index into the intercept if it is an array, it is a scalar otherwise + if isinstance(scorecard.estimator_.intercept_, np.ndarray): + intercept = float(scorecard.estimator_.intercept_[0]) + else: + intercept = float(scorecard.estimator_.intercept_) + coef = scorecard.estimator_.coef_.ravel() # Big-M parameters (min, max) points. diff --git a/optbinning/scorecard/monitoring.py b/optbinning/scorecard/monitoring.py index b675669..0226dad 100644 --- a/optbinning/scorecard/monitoring.py +++ b/optbinning/scorecard/monitoring.py @@ -469,6 +469,8 @@ def psi_plot(self, savefig=None): plt.legend(handles, labels, loc="upper center", bbox_to_anchor=(0.5, -0.2), ncol=2, fontsize=12) + plt.tight_layout() + if savefig is None: plt.show() else: diff --git a/setup.py b/setup.py index 610501d..400194a 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,8 @@ #!/usr/bin/env python import os -import sys from setuptools import find_packages, setup, Command -from setuptools.command.test import test as TestCommand - long_description = ''' The optimal binning is the optimal discretization of a variable into bins @@ -34,24 +31,10 @@ def run(self): os.system('rm -vrf ./build ./dist ./*.pyc ./*.tgz ./*.egg-info') -# test suites -class PyTest(TestCommand): - def finalize_options(self): - TestCommand.finalize_options(self) - self.test_args = [] - self.test_suite = [] - - def run_tests(self): - # import here, because outside the eggs aren't loaded - import pytest - errcode = pytest.main(self.test_args) - sys.exit(errcode) - - # install requirements install_requires = [ 'matplotlib', - 'numpy>=1.16.1,<2', + 'numpy>=1.16.1', 'ortools>=9.4', 'pandas', 'ropwr>=1.0.0', @@ -59,15 +42,19 @@ def run_tests(self): 'scipy>=1.6.0', ] -# test requirements -tests_require = [ - 'pytest', - 'coverage' -] - # extra requirements extras_require = { 'distributed': ['pympler', 'tdigest'], + 'test': [ + 'coverage', + 'flake8', + 'pytest', + 'pyarrow', + 'pympler', + 'tdigest', + ], + # For ecos support: 
https://github.com/embotech/ecos + 'ecos': ['ecos'] } @@ -89,10 +76,9 @@ def run_tests(self): include_package_data=True, license="Apache Licence 2.0", url="https://github.com/guillermo-navas-palencia/optbinning", - cmdclass={'clean': CleanCommand, 'test': PyTest}, + cmdclass={'clean': CleanCommand}, python_requires='>=3.7', install_requires=install_requires, - tests_require=tests_require, extras_require=extras_require, classifiers=[ 'Topic :: Scientific/Engineering :: Mathematics', @@ -103,7 +89,9 @@ def run_tests(self): 'Intended Audience :: Science/Research', 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9'] + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + ] ) diff --git a/test_requirements.txt b/test_requirements.txt index 65833da..88a9557 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,4 +1,6 @@ coverage flake8 pytest -pyarrow \ No newline at end of file +pyarrow +pympler +tdigest diff --git a/tests/test_binning_piecewise.py b/tests/test_binning_piecewise.py index d464e8c..29ea04d 100644 --- a/tests/test_binning_piecewise.py +++ b/tests/test_binning_piecewise.py @@ -174,7 +174,7 @@ def test_default(): optb.fit(x, y) optb.binning_table.build() - assert optb.binning_table.iv == approx(5.87152846, rel=1e-6) + assert optb.binning_table.iv == approx(5.87474602, rel=1e-6) with raises(ValueError): optb.binning_table.plot(metric="new_metric") @@ -188,7 +188,7 @@ def test_default_discontinuous(): optb.fit(x, y) optb.binning_table.build() - assert optb.binning_table.iv == approx(5.84252707, rel=1e-6) + assert optb.binning_table.iv == approx(5.84465825, rel=1e-6) def test_bounds_transform(): @@ -197,11 +197,11 @@ def test_bounds_transform(): x_transform_woe 
= optb.transform(x, metric="woe") assert x_transform_woe[:4] == approx( - [3.9899792, 4.2806587, 4.17226985, -3.25509338], rel=1e-6) + [3.99180564, 4.28245092, 4.17407503, -3.2565373], rel=1e-6) x_transform_event_rate = optb.transform(x, metric="event_rate") assert x_transform_event_rate[:4] == approx( - [0.03021225, 0.02276486, 0.02530506, 0.97760445], rel=1e-6) + [0.03015878, 0.02272502, 0.02526056, 0.97763604], rel=1e-6) def test_bounds_fit_transform(): @@ -211,13 +211,11 @@ def test_bounds_fit_transform(): x, y, lb=0.001, ub=0.999, metric="woe") assert x_transform_woe[:4] == approx( - [3.9899792, 4.2806587, 4.17226985, -3.25509338], rel=1e-6) - + [3.9918056, 4.2824509, 4.17407503, -3.25653732], rel=1e-6) x_transform_event_rate = optb.fit_transform( x, y, lb=0.001, ub=0.999, metric="event_rate") - assert x_transform_event_rate[:4] == approx( - [0.03021225, 0.02276486, 0.02530506, 0.97760445], rel=1e-6) + [0.03015878, 0.02272502, 0.02526056, 0.97763604], rel=1e-6) def test_solvers(): @@ -226,7 +224,7 @@ def test_solvers(): optb.fit(x, y) optb.binning_table.build() - assert optb.binning_table.iv == approx(5.87152846, rel=1e-6) + assert optb.binning_table.iv == approx(5.87474602, rel=1e-6) def test_user_splits(): diff --git a/tests/test_continuous_binning_piecewise.py b/tests/test_continuous_binning_piecewise.py index f5f5639..be6784c 100644 --- a/tests/test_continuous_binning_piecewise.py +++ b/tests/test_continuous_binning_piecewise.py @@ -80,7 +80,7 @@ def test_special_codes(): name=variable, monotonic_trend="convex", special_codes=special_codes) optb.fit(x, y) - x_transform = optb.transform([np.NaN], metric_missing='empirical') + x_transform = optb.transform([np.nan], metric_missing='empirical') assert x_transform == approx([17.94], rel=1e-6) diff --git a/tests/test_scorecard.py b/tests/test_scorecard.py index f937951..94bf7f4 100644 --- a/tests/test_scorecard.py +++ b/tests/test_scorecard.py @@ -184,10 +184,10 @@ def test_default(): sct = 
scorecard.table(style="summary") sc_min, sc_max = sct.groupby("Variable").agg( - {'Points': [np.min, np.max]}).sum() + {'Points': ['min', 'max']}).sum() - assert sc_min == approx(-43.65762593147646, rel=1e-6) - assert sc_max == approx(42.69694657427327, rel=1e-6) + assert sc_min == approx(-43.5354465187911, rel=1e-6) + assert sc_max == approx(42.55760963498596, rel=1e-6) def test_default_continuous(): @@ -204,7 +204,7 @@ def test_default_continuous(): sct = scorecard.table(style="detailed") sc_min, sc_max = sct.groupby("Variable").agg( - {'Points': [np.min, np.max]}).sum() + {'Points': ['min', 'max']}).sum() assert sc_min == approx(-43.261900687199045, rel=1e-6) assert sc_max == approx(100.28829019286185, rel=1e-6) @@ -229,10 +229,10 @@ def test_scaling_method_pdo_odd(): sct = scorecard.table(style="summary") sc_min, sc_max = sct.groupby("Variable").agg( - {'Points': [np.min, np.max]}).sum() + {'Points': ['min', 'max']}).sum() - assert sc_min == approx(-612.2266586867094, rel=1e-6) - assert sc_max == approx(1879.4396115559216, rel=1e-6) + assert sc_min == approx(-608.2909715472422, rel=1e-6) + assert sc_max == approx(1875.829531813342, rel=1e-6) def test_scaling_method_min_max(): @@ -253,7 +253,7 @@ def test_scaling_method_min_max(): sct = scorecard.table(style="summary") sc_min, sc_max = sct.groupby("Variable").agg( - {'Points': [np.min, np.max]}).sum() + {'Points': ['min', 'max']}).sum() assert sc_min == approx(300, rel=1e-6) assert sc_max == approx(850, rel=1e-6) @@ -277,7 +277,7 @@ def test_intercept_based(): sct = scorecard.table(style="summary") sc_min, sc_max = sct.groupby("Variable").agg( - {'Points': [np.min, np.max]}).sum() + {'Points': ['min', 'max']}).sum() assert sc_min == approx(300 - scorecard.intercept_, rel=1e-6) assert sc_max == approx(850 - scorecard.intercept_, rel=1e-6) @@ -301,7 +301,7 @@ def test_reverse_scorecard(): sct = scorecard.table(style="summary") sc_min, sc_max = sct.groupby("Variable").agg( - {'Points': [np.min, np.max]}).sum() + 
{'Points': ['min', 'max']}).sum() assert sc_min == approx(300, rel=1e-6) assert sc_max == approx(850, rel=1e-6) @@ -325,9 +325,9 @@ def test_rounding(): sct = scorecard.table(style="summary") sc_min, sc_max = sct.groupby("Variable").agg( - {'Points': [np.min, np.max]}).sum() + {'Points': ['min', 'max']}).sum() - assert sc_min == approx(201, rel=1e-6) + assert sc_min == approx(200, rel=1e-6) assert sc_max == approx(851, rel=1e-6) @@ -350,10 +350,10 @@ def test_rounding_pdo_odds(): sct = scorecard.table(style="summary") sc_min, sc_max = sct.groupby("Variable").agg( - {'Points': [np.min, np.max]}).sum() + {'Points': ['min', 'max']}).sum() - assert sc_min == approx(-612, rel=1e-6) - assert sc_max == approx(1880, rel=1e-6) + assert sc_min == approx(-609, rel=1e-6) + assert sc_max == approx(1876, rel=1e-6) def test_estimator_not_coef(): @@ -403,12 +403,23 @@ def test_predict_score(): assert pred[:5] == approx([0, 0, 0, 0, 0]) - assert pred_proba[:5, 1] == approx( - [1.15260206e-06, 9.79035720e-06, 7.52481206e-08, 1.12438599e-03, - 9.83145644e-06], rel=1e-6) - - assert score[:5] == approx([652.16590046, 638.52659074, 669.56413105, - 608.27744027, 638.49988325], rel=1e-6) + expected_pred_proba = [ + 1.18812864e-06, + 1.01521192e-05, + 7.65959946e-08, + 1.09683243e-03, + 9.99982719e-06 + ] + assert pred_proba[:5, 1] == approx(expected_pred_proba, rel=1e-6) + + expected_score = [ + 652.16890659, + 638.45026205, + 669.70058258, + 608.50009151, + 638.54691686 + ] + assert score[:5] == approx(expected_score, rel=1e-6) def test_information(): diff --git a/tests/test_scorecard_monitoring.py b/tests/test_scorecard_monitoring.py index 3abadf7..d9cba6e 100644 --- a/tests/test_scorecard_monitoring.py +++ b/tests/test_scorecard_monitoring.py @@ -119,7 +119,7 @@ def test_default_binary(): # Check psi_table psi_table = monitoring.psi_table() - assert psi_table.PSI.sum() == approx(0.003536079105130241) + assert psi_table.PSI.sum() == approx(0.002224950381423254) # Check 
psi_variable_table with raises(ValueError): @@ -134,7 +134,7 @@ def test_default_binary(): # Check tests table tests_table = monitoring.tests_table() assert tests_table["p-value"].values[:2] == approx( - [0.00077184, 0.51953576], rel=1e-4) + [0.00250006, 0.49480006], rel=1e-4) # Check system stability report with open("tests/results/test_scorecard_monitoring_default.txt", "w") as f: