From bccbd34b9ff761d7fd3450fd3a7f21157c69d048 Mon Sep 17 00:00:00 2001 From: janezd Date: Sat, 13 Jul 2024 21:31:05 +0200 Subject: [PATCH] Feature as Predictor: New widget --- Orange/classification/tests/test_column.py | 165 +++++++++ Orange/modelling/__init__.py | 1 + Orange/modelling/column.py | 155 +++++++++ Orange/modelling/tests/test_column.py | 306 +++++++++++++++++ Orange/tests/test_classification.py | 29 +- .../widgets/evaluate/owfeatureaspredictor.py | 179 ++++++++++ .../tests/test_owfeatureaspredictor.py | 315 ++++++++++++++++++ i18n/si/msgs.jaml | 61 +++- 8 files changed, 1196 insertions(+), 15 deletions(-) create mode 100644 Orange/classification/tests/test_column.py create mode 100644 Orange/modelling/column.py create mode 100644 Orange/modelling/tests/test_column.py create mode 100644 Orange/widgets/evaluate/owfeatureaspredictor.py create mode 100644 Orange/widgets/evaluate/tests/test_owfeatureaspredictor.py diff --git a/Orange/classification/tests/test_column.py b/Orange/classification/tests/test_column.py new file mode 100644 index 00000000000..d34102d4b66 --- /dev/null +++ b/Orange/classification/tests/test_column.py @@ -0,0 +1,165 @@ +import unittest +from unittest.mock import patch + +import numpy as np + +from Orange.classification import ColumnLearner, ColumnClassifier +from Orange.data import DiscreteVariable, ContinuousVariable, Domain, Table + + +class ColumnTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.domain = Domain([DiscreteVariable("d1", values=["a", "b"]), + DiscreteVariable("d2", values=["c", "d"]), + DiscreteVariable("d3", values=["d", "c"]), + ContinuousVariable("c1"), + ContinuousVariable("c2") + ], + DiscreteVariable("cls", values=["c", "d"]), + [DiscreteVariable("m1", values=["a", "b"]), + DiscreteVariable("m2", values=["d"]), + ContinuousVariable("c3")] + ) + cls.data = Table.from_numpy( + cls.domain, + np.array([[0, 0, 0, 1, 0.5], + [0, 1, 1, 0.25, -3], + [1, 0, np.nan, np.nan, np.nan]]), + np.array([0, 
1, 1]), + np.array([[0, 0, 2], + [1, 0, 8], + [np.nan, np.nan, 5]]) + ) + + @patch("Orange.classification.column.ColumnModel") + def test_fit_storage(self, clsfr): + learner = ColumnLearner(self.domain.class_var, self.domain["d2"]) + self.assertEqual(learner.name, "column 'd2'") + learner.fit_storage(self.data) + clsfr.assert_called_with(self.domain.class_var, self.domain["d2"], None, None) + + learner = ColumnLearner(self.domain.class_var, self.domain["c3"]) + learner.fit_storage(self.data) + clsfr.assert_called_with(self.domain.class_var, self.domain["c3"], None, None) + + learner = ColumnLearner(self.domain.class_var, self.domain["c3"], 42, 3.5) + self.assertEqual(learner.name, "column 'c3'") + learner.fit_storage(self.data) + clsfr.assert_called_with(self.domain.class_var, self.domain["c3"], 42, 3.5) + + def test_classifier_init_checks(self): + cls = ColumnClassifier(self.domain.class_var, self.domain["d2"]) + cls.name = "column 'd2'" + + cls = ColumnClassifier(self.domain.class_var, self.domain["d3"]) + cls.name = "column 'd3'" + + cls = ColumnClassifier(self.domain.class_var, self.domain["c3"]) + cls.name = "column 'c3'" + + self.assertRaises( + ValueError, + ColumnClassifier, + self.domain.class_var, self.domain["d1"]) + + self.assertRaises( + ValueError, + ColumnClassifier, + DiscreteVariable("x", values=("a", "b", "c")), self.domain["c3"]) + + def test_check_prob_range(self): + self.assertTrue( + ColumnClassifier.valid_prob_range(np.array([0, 0.5, 1])) + ) + self.assertTrue( + ColumnClassifier.valid_prob_range(np.array([0, 0.5, np.nan])) + ) + self.assertFalse( + ColumnClassifier.valid_prob_range(np.array([0, 0.5, 1.5])) + ) + self.assertFalse( + ColumnClassifier.valid_prob_range(np.array([0, 0.5, -1])) + ) + + def test_check_value_sets(self): + d1, d2, d3, *_ = self.domain.attributes + c = self.domain.class_var + m2: DiscreteVariable = self.domain["m2"] + self.assertFalse(ColumnClassifier.valid_value_sets(c, d1)) + 
self.assertTrue(ColumnClassifier.valid_value_sets(c, d2)) + self.assertTrue(ColumnClassifier.valid_value_sets(c, d3)) + self.assertTrue(ColumnClassifier.valid_value_sets(c, m2)) + self.assertFalse(ColumnClassifier.valid_value_sets(m2, c)) + + def test_predict_discrete(self): + # Just copy + model = ColumnClassifier(self.domain.class_var, self.domain["d2"]) + self.assertEqual(model.name, "column 'd2'") + classes, probs = model(self.data, model.ValueProbs) + np.testing.assert_equal(classes, [0, 1, 0]) + np.testing.assert_equal(probs, [[1, 0], [0, 1], [1, 0]]) + + # Values are not in the same order -> map + model = ColumnClassifier(self.domain.class_var, self.domain["d3"]) + classes, probs = model(self.data, model.ValueProbs) + np.testing.assert_equal(classes, [1, 0, np.nan]) + np.testing.assert_equal(probs, [[0, 1], [1, 0], [0.5, 0.5]]) + + # Not in the same order, and one is missing -> map + model = ColumnClassifier(self.domain.class_var, self.domain["m2"]) + classes, probs = model(self.data, model.ValueProbs) + np.testing.assert_equal(classes, [1, 1, np.nan]) + np.testing.assert_equal(probs, [[0, 1], [0, 1], [0.5, 0.5]]) + + # Non-binary class + domain = Domain( + self.domain.attributes, + DiscreteVariable("cls", values=["a", "c", "b", "d", "e"])) + data = Table.from_numpy(domain, self.data.X, self.data.Y) + model = ColumnClassifier(domain.class_var, domain["d3"]) + classes, probs = model(data, model.ValueProbs) + np.testing.assert_equal(classes, [3, 1, np.nan]) + np.testing.assert_almost_equal( + probs, + np.array([[0, 0, 0, 1, 0], + [0, 1, 0, 0, 0], + [0.2, 0.2, 0.2, 0.2, 0.2]])) + + def test_predict_as_direct_probs(self): + model = ColumnClassifier(self.domain.class_var, self.domain["c1"]) + self.assertEqual(model.name, "column 'c1'") + classes, probs = model(self.data, model.ValueProbs) + np.testing.assert_equal(classes, [1, 0, np.nan]) + np.testing.assert_equal(probs, [[0, 1], [0.75, 0.25], [0.5, 0.5]]) + + model = ColumnClassifier(self.domain.class_var, 
self.domain["c2"]) + self.assertRaises(ValueError, model, self.data) + + model = ColumnClassifier(self.domain.class_var, self.domain["c3"]) + self.assertRaises(ValueError, model, self.data) + + def test_predict_with_logistic(self): + model = ColumnClassifier( + self.domain.class_var, self.domain["c1"], 0.5, 3) + classes, probs = model(self.data, model.ValueProbs) + np.testing.assert_equal(classes, [1, 0, np.nan]) + np.testing.assert_almost_equal( + probs[:, 1], [1 / (1 + np.exp(-3 * (1 - 0.5))), + 1 / (1 + np.exp(-3 * (0.25 - 0.5))), + 0.5]) + np.testing.assert_equal(probs[:, 0], 1 - probs[:, 1]) + + model = ColumnClassifier( + self.domain.class_var, self.domain["c2"], 0.5, 3) + classes, probs = model(self.data, model.ValueProbs) + np.testing.assert_equal(classes, [0, 0, np.nan]) + np.testing.assert_almost_equal( + probs[:, 1], [1 / (1 + np.exp(-3 * (0.5 - 0.5))), + 1 / (1 + np.exp(-3 * (-3 - 0.5))), + 0.5]) + np.testing.assert_equal(probs[:, 0], 1 - probs[:, 1]) + + +if __name__ == "__main__": + unittest.main() diff --git a/Orange/modelling/__init__.py b/Orange/modelling/__init__.py index 206151fcdf2..22d91d563c0 100644 --- a/Orange/modelling/__init__.py +++ b/Orange/modelling/__init__.py @@ -11,6 +11,7 @@ from .randomforest import * from .svm import * from .tree import * +from .column import * try: from .catgb import * except ImportError: diff --git a/Orange/modelling/column.py b/Orange/modelling/column.py new file mode 100644 index 00000000000..447097461fd --- /dev/null +++ b/Orange/modelling/column.py @@ -0,0 +1,155 @@ +from typing import Optional + +import numpy as np + +from Orange.data import Variable, DiscreteVariable, Domain, Table +from Orange.classification import LogisticRegressionLearner +from Orange.regression import LinearRegressionLearner +from Orange.modelling import Model, Learner + +__all__ = ["ColumnLearner", "ColumnModel"] + + +def _check_column_combinations( + class_var: Variable, + column: Variable, + fit_regression: bool): + if 
class_var.is_continuous: + if not column.is_continuous: + raise ValueError( + "Regression can only be used with numeric variables") + return + + assert isinstance(class_var, DiscreteVariable) # remove type warnings + if column.is_continuous: + if len(class_var.values) != 2: + raise ValueError( + "Numeric columns can only be used with binary class variables") + else: + assert isinstance(column, DiscreteVariable) + if not valid_value_sets(class_var, column): + raise ValueError( + "Column contains values that are not in class variable") + if fit_regression and not column.is_continuous: + raise ValueError( + "Intercept and coefficient are only allowed for continuous " + "variables") + + +def valid_prob_range(values: np.ndarray): + return np.nanmin(values) >= 0 and np.nanmax(values) <= 1 + + +def valid_value_sets(class_var: DiscreteVariable, + column_var: DiscreteVariable): + return set(column_var.values) <= set(class_var.values) + + +class ColumnLearner(Learner): + def __init__(self, + class_var: Variable, + column: Variable, + fit_regression: bool = False): + super().__init__() + _check_column_combinations(class_var, column, fit_regression) + self.class_var = class_var + self.column = column + self.fit_regression = fit_regression + self.name = f"column '{column.name}'" + + def __fit_coefficients(self, data: Table): + # Use learners from Orange rather than directly calling + # scikit-learn, so that we make sure we use the same parameters + # and get the same result as we would if we used the widgets. 
+ data1 = data.transform(Domain([self.column], self.class_var)) + if self.class_var.is_discrete: + model = LogisticRegressionLearner()(data1) + return model.intercept[0], model.coefficients[0][0] + else: + model = LinearRegressionLearner()(data1) + return model.intercept, model.coefficients[0] + + def fit_storage(self, data: Table): + if data.domain.class_var != self.class_var: + raise ValueError("Class variable does not match the data") + if not self.fit_regression: + return ColumnModel(self.class_var, self.column) + + intercept, coefficient = self.__fit_coefficients(data) + return ColumnModel(self.class_var, self.column, intercept, coefficient) + + +class ColumnModel(Model): + def __init__(self, + class_var: Variable, + column: Variable, + intercept: Optional[float] = None, + coefficient: Optional[float] = None): + super().__init__(Domain([column], class_var)) + + _check_column_combinations(class_var, column, intercept is not None) + if (intercept is not None) is not (coefficient is not None): + raise ValueError( + "Intercept and coefficient must both be provided or absent") + + self.class_var = class_var + self.column = column + self.intercept = intercept + self.coefficient = coefficient + if (column.is_discrete and + class_var.values[:len(column.values)] != column.values): + self.value_mapping = np.array([class_var.to_val(x) + for x in column.values]) + else: + self.value_mapping = None + + pars = f" ({intercept}, {coefficient})" if intercept is not None else "" + self.name = f"column '{column.name}'{pars}" + + def predict_storage(self, data: Table): + vals = data.get_column(self.column) + if self.class_var.is_discrete: + return self._predict_discrete(vals) + else: + return self._predict_continuous(vals) + + def _predict_discrete(self, vals): + assert isinstance(self.class_var, DiscreteVariable) + nclasses = len(self.class_var.values) + proba = np.full((len(vals), nclasses), np.nan) + rows = np.isfinite(vals) + if self.column.is_discrete: + mapped = 
vals[rows].astype(int) + if self.value_mapping is not None: + mapped = self.value_mapping[mapped] + vals = vals.copy() + vals[rows] = mapped + proba[rows] = 0 + proba[rows, mapped] = 1 + else: + if self.coefficient is None: + if not valid_prob_range(vals): + raise ValueError("Column values must be in [0, 1] range " + "unless logistic function is applied") + proba[rows, 1] = vals[rows] + else: + proba[rows, 1] = ( + 1 / + (1 + np.exp(-self.intercept - self.coefficient * vals[rows]) + )) + + proba[rows, 0] = 1 - proba[rows, 1] + vals = (proba[:, 1] > 0.5).astype(float) + vals[~rows] = np.nan + return vals, proba + + def _predict_continuous(self, vals): + if self.coefficient is None: + return vals + else: + return vals * self.coefficient + self.intercept + + def __str__(self): + pars = f" ({self.intercept}, {self.coefficient})" \ + if self.intercept is not None else "" + return f'ColumnModel {self.column.name}{pars}' diff --git a/Orange/modelling/tests/test_column.py b/Orange/modelling/tests/test_column.py new file mode 100644 index 00000000000..c8ecbfe9b0d --- /dev/null +++ b/Orange/modelling/tests/test_column.py @@ -0,0 +1,306 @@ +import unittest +from unittest.mock import patch + +import numpy as np + +from Orange.data import DiscreteVariable, ContinuousVariable, Table, Domain +from Orange.modelling.column import _check_column_combinations, \ + valid_prob_range, valid_value_sets, ColumnLearner, ColumnModel + + +class TestBase(unittest.TestCase): + def setUp(self): + self.disc_a = DiscreteVariable("a", values=("a", "b", "c")) + self.disc_b = DiscreteVariable("b", values=("c", "a", "b")) + self.disc_c = DiscreteVariable("c", values=("c", "b")) + self.cont_e = ContinuousVariable("e") + self.cont_f = ContinuousVariable("f") + self.cont_g = ContinuousVariable("g") + + +class TestColumnLearner(TestBase): + @patch("Orange.modelling.column._check_column_combinations") + def test_column_regressor(self, check): + data = Table.from_numpy( + Domain([self.disc_a, self.cont_e, 
self.cont_f], + self.cont_g), + np.array([[0, 1, -6], + [1, 2, -4], + [2, 4, 0], + [np.nan, 6, 4], + [0, 3, -2]]), + np.array([0, 1, 3, 5, 2])) + + model = ColumnLearner(self.cont_g, self.cont_e, True)(data) + check.assert_called() + self.assertIs(model.class_var, self.cont_g) + self.assertIs(model.column, self.cont_e) + self.assertAlmostEqual(model.intercept, -1) + self.assertAlmostEqual(model.coefficient, 1) + + model = ColumnLearner(self.cont_g, self.cont_f, True)(data) + self.assertIs(model.class_var, self.cont_g) + self.assertIs(model.column, self.cont_f) + self.assertAlmostEqual(model.intercept, 3) + self.assertAlmostEqual(model.coefficient, 0.5) + + check.reset_mock() + model = ColumnLearner(self.cont_g, self.cont_f)(data) + check.assert_called() + self.assertIs(model.class_var, self.cont_g) + self.assertIs(model.column, self.cont_f) + self.assertIsNone(model.intercept) + self.assertIsNone(model.coefficient) + + @patch("Orange.modelling.column._check_column_combinations") + def test_column_classifier_from_numeric(self, check): + data = Table.from_numpy( + Domain([self.disc_a, self.disc_b, self.cont_e], + self.disc_c), + np.array([[0, 1, 0], + [1, 0, 1], + [2, 1, 1], + [0, 2, 0], + [0, 0, 1]]), + np.array([0, 1, 0, 1, 0])) + + model = ColumnLearner(self.disc_c, self.cont_e, True)(data) + check.assert_called() + self.assertIs(model.class_var, self.disc_c) + self.assertIs(model.column, self.cont_e) + # These values were not computed manually + self.assertAlmostEqual(model.intercept, -0.3127646959895215) + self.assertEqual(model.coefficient, -0.15535275317811897) + + @patch("Orange.modelling.column._check_column_combinations") + def test_column_classifier_from_discrete(self, check): + data = Table.from_numpy( + Domain([self.disc_b, self.disc_c, self.cont_e], + self.disc_a), + np.array([[0, 1, 0], + [1, 0, 1], + [2, 1, 1], + [0, 0, 0], + [0, 0, 1]]), + np.array([0, 1, 2, 1, 0])) + model = ColumnLearner(self.disc_a, self.disc_c)(data) + check.assert_called() + 
self.assertIs(model.class_var, self.disc_a) + self.assertIs(model.column, self.disc_c) + self.assertIsNone(model.intercept) + self.assertIsNone(model.coefficient) + + def test_class_mismatch(self): + data = Table.from_numpy( + Domain([self.disc_b, self.disc_c, self.cont_e], + self.disc_a), + np.array([[0, 1, 0], + [1, 0, 1], + [2, 1, 1], + [0, 0, 0], + [0, 0, 1]]), + np.array([0, 1, 2, 1, 0])) + + self.assertRaises( + ValueError, + ColumnLearner(self.disc_b, self.disc_c), + data) + + +class TestModel(TestBase): + def test_str(self): + model = ColumnModel(self.disc_a, self.disc_c) + self.assertEqual(str(model), "ColumnModel c") + + model = ColumnModel(self.disc_c, self.cont_e, 1, 2) + self.assertEqual(str(model), "ColumnModel e (1, 2)") + + def test_mapping(self): + model = ColumnModel(self.disc_a, + DiscreteVariable("a2", self.disc_a.values)) + self.assertIsNone(model.value_mapping) + model = ColumnModel(self.disc_c, self.cont_e) + self.assertIsNone(model.value_mapping) + model = ColumnModel(self.disc_c, self.cont_e, 1, 2) + self.assertIsNone(model.value_mapping) + model = ColumnModel(self.disc_b, self.disc_c) + np.testing.assert_equal(model.value_mapping, [0, 2]) + model = ColumnModel(self.disc_a, self.disc_c) + np.testing.assert_equal(model.value_mapping, [2, 1]) + model = ColumnModel(self.disc_b, self.disc_a) + np.testing.assert_equal(model.value_mapping, [1, 2, 0]) + + @patch("Orange.modelling.column._check_column_combinations") + def test_check_validity(self, check): + ColumnModel(self.disc_c, self.cont_e) + check.assert_called() + + with self.assertRaises(ValueError): + ColumnModel(self.disc_b, self.cont_e, 0, None) + with self.assertRaises(ValueError): + ColumnModel(self.disc_b, self.cont_e, None, 1) + + def test_name(self): + self.assertEqual( + ColumnModel(self.disc_c, self.cont_e).name, + "column 'e'") + + self.assertEqual( + ColumnModel(self.disc_c, self.cont_e, -1, 2).name, + "column 'e' (-1, 2)") + + 
@patch("Orange.modelling.column.ColumnModel._predict_discrete") + @patch("Orange.modelling.column.ColumnModel._predict_continuous") + def test_predict_storage(self, predict_cont, predict_disc): + model = ColumnModel(self.cont_e, self.cont_f) + data = Table.from_numpy( + Domain([self.cont_f], self.cont_e), + np.array([[1], [2], [3]]), + np.array([0, 1, 2])) + model.predict_storage(data) + predict_cont.assert_called() + predict_cont.reset_mock() + predict_disc.assert_not_called() + + model = ColumnModel(self.disc_a, self.disc_c) + data = Table.from_numpy( + Domain([self.disc_c], self.disc_a), + np.array([[0], [1], [0]]), + np.array([0, 1, 0])) + model.predict_storage(data) + predict_disc.assert_called() + predict_cont.assert_not_called() + + def test_predict_disc_from_disc_w_mapping(self): + data = Table.from_numpy( + Domain([self.disc_a, self.disc_c, self.cont_e], + self.disc_b), + np.array([[0, 1, 0], # a, b, 0 + [1, 0, 1], # b c 1 + [2, 1, 1], # c b 1 + [0, 0, 0], # a c 0 + [0, 0, 1]]), # a c 1 + np.array([0, 1, 0, 1, 0])) + + model = ColumnModel(self.disc_b, self.disc_c) + vals, probs = model.predict_storage(data) + np.testing.assert_equal(vals, [2, 0, 2, 0, 0]) + np.testing.assert_almost_equal(probs, [[0, 0, 1], + [1, 0, 0], + [0, 0, 1], + [1, 0, 0], + [1, 0, 0]]) + + model = ColumnModel(self.disc_b, self.disc_a) + vals, probs = model.predict_storage(data) + np.testing.assert_equal(vals, [1, 2, 0, 1, 1]) + np.testing.assert_almost_equal(probs, [[0, 1, 0], + [0, 0, 1], + [1, 0, 0], + [0, 1, 0], + [0, 1, 0]]) + + def test_predict_disc_from_disc_wout_mapping(self): + data = Table.from_numpy( + Domain([self.disc_a, self.disc_c, self.cont_e], + DiscreteVariable("a1", values=self.disc_a.values)), + np.array([[0, 1, 0], + [1, 0, 1], + [2, 1, 1], + [0, 0, 0], + [0, 0, 1]]), + np.array([0, 1, 0, 1, 0])) + + model = ColumnModel(data.domain.class_var, self.disc_a) + vals, probs = model.predict_storage(data) + np.testing.assert_equal(vals, [0, 1, 2, 0, 0]) + 
np.testing.assert_almost_equal(probs, [[1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [1, 0, 0], + [1, 0, 0]]) + + def test_predict_disc_from_cont(self): + data = Table.from_numpy( + Domain([self.cont_e, self.cont_f], + self.disc_c), + np.array([[0, -1], + [1, 0], + [0.45, 1], + [np.nan, 2], + [0.52, np.nan]]), + np.array([0, 1, 0, 1, 0])) + + model = ColumnModel(data.domain.class_var, self.cont_e) + vals, probs = model.predict_storage(data) + p1 = np.array([0, 1, 0.45, np.nan, 0.52]) + np.testing.assert_equal(vals, [0, 1, 0, np.nan, 1]) + np.testing.assert_almost_equal(probs, np.vstack((1 - p1, p1)).T) + + model = ColumnModel(self.disc_c, self.cont_e, -0.1, 5) + vals, probs = model.predict_storage(data) + p1e = 1 / (1 + np.exp(+0.1 - 5 * p1)) + np.testing.assert_equal(vals, [0, 1, 1, np.nan, 1]) + np.testing.assert_almost_equal(probs, np.vstack((1 - p1e, p1e)).T) + + with self.assertRaises(ValueError): + # values outside [0, 1] range + ColumnModel(self.disc_c, self.cont_f)(data) + + model = ColumnModel(self.disc_c, self.cont_f, -1, 5) + vals, probs = model.predict_storage(data) + p1 = 1 / (1 + np.exp(1 - 5 * data.X[:, 1])) + np.testing.assert_equal(vals, [0, 0, 1, 1, np.nan]) + np.testing.assert_almost_equal(probs, np.vstack((1 - p1, p1)).T) + + def test_predict_cont(self): + data = Table.from_numpy( + Domain([self.cont_e, self.cont_f], + self.cont_g), + np.array([[0, -1], + [1, 0], + [0.45, 1], + [np.nan, 2], + [0.52, np.nan]]), + np.array([0, 1, 0.45, 2, 0.52])) + + model = ColumnModel(self.cont_g, self.cont_e) + np.testing.assert_equal(model.predict_storage(data), data.X[:, 0]) + + model = ColumnModel(self.cont_g, self.cont_f, -0.1, 5) + np.testing.assert_almost_equal( + model.predict_storage(data), + -0.1 + 5 * data.X[:, 1]) + + +class Test(TestBase): + def test_checks(self): + def check(class_var, column, fit=False): + _check_column_combinations(class_var, column, fit) + + def value_error(*args): + self.assertRaises(ValueError, check, *args) + + 
value_error(self.cont_e, self.disc_a) # regression from discrete column + value_error(self.disc_a, self.cont_e) # non-binary class from numeric + value_error(self.disc_c, self.disc_a) # column has vales not in class + value_error(self.disc_a, self.disc_b, True) # fitting from discrete column + + check(self.cont_e, self.cont_f) + check(self.cont_e, self.cont_f, True) + check(self.disc_a, self.disc_b) + check(self.disc_a, self.disc_c) + + def test_valid_prob_range(self): + self.assertTrue(valid_prob_range(np.array([1, 0, 0.5]))) + self.assertFalse(valid_prob_range(np.array([-0.1, 0.5, 1]))) + self.assertFalse(valid_prob_range(np.array([0, 1.1, 0.5]))) + + def test_valid_value_sets(self): + self.assertTrue(valid_value_sets(self.disc_a, self.disc_b)) + self.assertTrue(valid_value_sets(self.disc_a, self.disc_c)) + self.assertFalse(valid_value_sets(self.disc_c, self.disc_a)) + + +if __name__ == '__main__': + unittest.main() diff --git a/Orange/tests/test_classification.py b/Orange/tests/test_classification.py index 05ba316a21a..b89181502d4 100644 --- a/Orange/tests/test_classification.py +++ b/Orange/tests/test_classification.py @@ -22,6 +22,7 @@ SVMLearner, LinearSVMLearner, OneClassSVMLearner, TreeLearner, KNNLearner, SimpleRandomForestLearner, EllipticEnvelopeLearner, ThresholdLearner, CalibratedLearner) +from Orange.classification.column import ColumnLearner from Orange.classification.rules import _RuleLearner from Orange.data import (ContinuousVariable, DiscreteVariable, Domain, Table) @@ -30,6 +31,10 @@ from Orange.tests.dummy_learners import DummyLearner, DummyMulticlassLearner from Orange.tests import test_filename +# While this could be determined automatically from __init__ signatures, +# it is better to do it explicitly +LEARNERS_WITH_ARGUMENTS = (ThresholdLearner, CalibratedLearner, ColumnLearner) + def all_learners(): classification_modules = pkgutil.walk_packages( @@ -214,8 +219,7 @@ def test_result_shape(self): """ iris = Table('iris') for learner in 
all_learners(): - # calibration, threshold learners' __init__ requires arguments - if learner in (ThresholdLearner, CalibratedLearner): + if learner in LEARNERS_WITH_ARGUMENTS: continue # Skip learners that are incompatible with the dataset @@ -260,6 +264,8 @@ def test_result_shape_numpy(self): args = [] if learner in (ThresholdLearner, CalibratedLearner): args = [LogisticRegressionLearner()] + elif learner in LEARNERS_WITH_ARGUMENTS: + continue data = iris_bin if learner is ThresholdLearner else iris # Skip learners that are incompatible with the dataset if learner.incompatibility_reason(self, data.domain): @@ -284,6 +290,10 @@ def test_predict_proba(self): continue if learner in (ThresholdLearner, CalibratedLearner): model = learner(LogisticRegressionLearner())(data) + elif learner in LEARNERS_WITH_ARGUMENTS: + # note that above two also require arguments, but we + # provide them + continue else: model = learner()(data) probs = model.predict_proba(data) @@ -392,8 +402,7 @@ def test_unknown(self): def test_missing_class(self): table = Table(test_filename("datasets/adult_sample_missing")) for learner in all_learners(): - # calibration, threshold learners' __init__ require arguments - if learner in (ThresholdLearner, CalibratedLearner): + if learner in LEARNERS_WITH_ARGUMENTS: continue # Skip slow tests if isinstance(learner, _RuleLearner): @@ -421,8 +430,7 @@ def test_all_learners_accessible_in_Orange_classification_namespace(self): def test_all_models_work_after_unpickling(self): datasets = [Table('iris'), Table('titanic')] for learner in list(all_learners()): - # calibration, threshold learners' __init__ require arguments - if learner in (ThresholdLearner, CalibratedLearner): + if learner in LEARNERS_WITH_ARGUMENTS: continue # Skip slow tests if issubclass(learner, _RuleLearner): @@ -448,8 +456,7 @@ def test_all_models_work_after_unpickling(self): def test_all_models_work_after_unpickling_pca(self): datasets = [Table('iris'), Table('titanic')] for learner in 
list(all_learners()): - # calibration, threshold learners' __init__ require arguments - if learner in (ThresholdLearner, CalibratedLearner): + if learner in LEARNERS_WITH_ARGUMENTS: continue # Skip slow tests if issubclass(learner, _RuleLearner): @@ -478,8 +485,7 @@ def test_all_models_work_after_unpickling_pca(self): def test_adequacy_all_learners(self): for learner in all_learners(): - # calibration, threshold learners' __init__ requires arguments - if learner in (ThresholdLearner, CalibratedLearner): + if learner in LEARNERS_WITH_ARGUMENTS: continue with self.subTest(learner.__name__): learner = learner() @@ -488,8 +494,7 @@ def test_adequacy_all_learners(self): def test_adequacy_all_learners_multiclass(self): for learner in all_learners(): - # calibration, threshold learners' __init__ require arguments - if learner in (ThresholdLearner, CalibratedLearner): + if learner in LEARNERS_WITH_ARGUMENTS: continue with self.subTest(learner.__name__): learner = learner() diff --git a/Orange/widgets/evaluate/owfeatureaspredictor.py b/Orange/widgets/evaluate/owfeatureaspredictor.py new file mode 100644 index 00000000000..ce56a531cfc --- /dev/null +++ b/Orange/widgets/evaluate/owfeatureaspredictor.py @@ -0,0 +1,179 @@ +from itertools import chain + +from AnyQt.QtWidgets import QComboBox, QCheckBox + +from orangewidget import gui +from orangewidget.settings import Setting +from orangewidget.widget import Msg + +from Orange.data import Variable, Table +from Orange.modelling.column import ( + ColumnModel, ColumnLearner, valid_value_sets, valid_prob_range) +from Orange.widgets.widget import OWWidget, Input, Output +from Orange.widgets.utils.itemmodels import VariableListModel +from Orange.widgets.utils.widgetpreview import WidgetPreview + + +class OWFeatureAsPredictor(OWWidget): + name = "Feature as Predictor" + description = "Use a column as probabilities or predictions" + icon = "icons/FeatureAsPredictor.svg" + priority = 10 + keywords = "column predictor" + + want_main_area 
= False + resizing_enabled = False + + class Inputs: + data = Input("Data", Table) + + class Outputs: + learner = Output("Learner", ColumnLearner) + model = Output("Model", ColumnModel) + + class Error(OWWidget.Error): + no_class = Msg("Data has no target variable.") + no_variables = Msg("No useful variables") + + column_hint: Variable = Setting(None, schema_only=True) + # Stores the last user setting. + # apply_transformation tells what will actually happens; + # checkbox may be disabled and set to reflect apply_transformation. + apply_transformation_setting = Setting(False) + auto_apply = Setting(True) + + def __init__(self): + super().__init__() + self.data = None + self.column = None + self.apply_transformation = False + self.pars_to_report = (False, False) + + box = gui.vBox(self.controlArea, True) + + self.column_combo = combo = QComboBox() + combo.setModel(VariableListModel()) + box.layout().addWidget(combo) + @combo.activated.connect + def on_column_changed(index): + self.column = combo.model()[index] + self.column_hint = self.column.name + self._update_controls() + self.commit.deferred() + + self.cb_transformation = cb = QCheckBox("", self) + box.layout().addWidget(cb) + @cb.clicked.connect + def on_apply_transformation_changed(checked): + self.apply_transformation_setting \ + = self.apply_transformation = checked + self.commit.deferred() + + gui.auto_apply(self.controlArea, self, "auto_apply") + self._update_controls() + + def _update_controls(self): + cb = self.cb_transformation + data = self.data + + if data is None or self.column is None: + cb.setChecked(self.apply_transformation_setting) + cb.setDisabled(False) + return + + if self.column.is_discrete: + self.apply_transformation = False + cb.setChecked(False) + cb.setDisabled(True) + elif (data.domain.class_var.is_discrete + and not valid_prob_range(data.get_column(self.column))): + self.apply_transformation = True + cb.setChecked(True) + cb.setDisabled(True) + else: + self.apply_transformation = 
self.apply_transformation_setting + cb.setChecked(self.apply_transformation_setting) + cb.setDisabled(False) + + shape = "logistic" if data.domain.class_var.is_discrete else "linear" + cb.setText(f"Transform through {shape} function") + cb.setToolTip(f"Use {shape} regression to fit the model's coefficients") + + @Inputs.data + def set_data(self, data): + self._set_data(data) + self._update_controls() + self.commit.now() + + def _set_data(self, data): + column_model: VariableListModel = self.column_combo.model() + + self.Error.clear() + column_model.clear() + self.column = None + self.data = None + + if data is None: + return + + class_var = data.domain.class_var + if class_var is None: + self.Error.no_class() + return + + allow_continuous = (class_var.is_continuous + or len(class_var.values) == 2) + column_model[:] = ( + var + for var in chain(data.domain.attributes, data.domain.metas) + if (var.is_continuous and allow_continuous + or (var.is_discrete and class_var.is_discrete + and valid_value_sets(class_var, var)) + ) + ) + if not column_model: + self.Error.no_variables() + return + + self.data = data + if self.column_hint \ + and self.column_hint in self.data.domain \ + and (var := self.data.domain[self.column_hint]) in column_model: + self.column = var + self.column_combo.setCurrentIndex(column_model.indexOf(self.column)) + else: + self.column = column_model[0] + self.column_combo.setCurrentIndex(0) + self.column_hint = self.column.name + + @gui.deferred + def commit(self): + self.pars_to_report = (False, False) + if self.column is None: + self.Outputs.learner.send(None) + self.Outputs.model.send(None) + return + + learner = ColumnLearner( + self.data.domain.class_var, self.column, self.apply_transformation) + model = learner(self.data) + self.Outputs.learner.send(learner) + self.Outputs.model.send(model) + if self.apply_transformation: + self.pars_to_report = (model.intercept, model.coefficient) + + def send_report(self): + if self.column is None: + return + 
class OWFeatureAsPredictorTest(WidgetTest):
    """Tests for the OWFeatureAsPredictor widget.

    `setUp` builds three tables over the same rows that differ in the
    target: multinomial (`class_data`), binary (`bin_data`) and numeric
    (`regr_data`).
    """

    def setUp(self):
        """Create the widget and the three test tables."""
        self.widget = self.create_widget(OWFeatureAsPredictor)
        self.disc_a = DiscreteVariable("a", values=("a", "b", "c"))
        self.disc_b = DiscreteVariable("b", values=("c", "a", "b"))
        self.disc_c = DiscreteVariable("c", values=("c", "a", "b", "d"))
        self.disc_d = DiscreteVariable("d", values=("c", "b"))
        self.disc_de = DiscreteVariable("de", values=("b", "c"))
        self.cont_e = ContinuousVariable("e")
        self.cont_f = ContinuousVariable("f")
        self.cont_g = ContinuousVariable("g")

        attrs = [self.disc_b, self.disc_c, self.disc_d,
                 self.cont_e]
        meta_attrs = [self.cont_f, StringVariable("s"), self.disc_de]
        # Columns of x follow `attrs`: b, c, d, e.
        # NOTE(review): the 6 in the last row is not a valid index for the
        # two-valued `d` -- presumably deliberate; confirm.
        x = np.array([[0, 1, 0, 0.1],
                      [1, 1, 1, 0.6],
                      [2, 0, np.nan, 0.2],
                      [0, np.nan, 1, np.nan],
                      [np.nan, 0, 6, 0.8]])
        y = np.array([0, 1, 0, 1, 0])
        # Columns of metas follow `meta_attrs`: f, s, de.
        # NOTE(review): mixing floats and strings without dtype=object makes
        # NumPy build a string array -- confirm that Table.from_numpy
        # converts the numeric meta columns as intended.
        metas = np.array([[-0.1, "a", 0],
                          [0.3, "b", 1],
                          [np.nan, "c", 1],
                          [0.9, "d", 1],
                          [0.24, "e", 0]])

        # Multinomial target `a`
        self.class_data = Table.from_numpy(
            Domain(attrs, self.disc_a, meta_attrs),
            x, y, metas
        )

        # Binary target `de`; it is the class here, so drop it from metas
        self.bin_data = Table.from_numpy(
            Domain(attrs, self.disc_de, meta_attrs[:-1]),
            x, y, metas[:, :-1]
        )

        # Numeric target `g`; attribute `b` is left out
        self.regr_data = Table.from_numpy(
            Domain(attrs[1:], self.cont_g, meta_attrs),
            x[:, 1:], y, metas
        )

    def set_column(self, var):
        """Select `var` in the column combo as a user would."""
        combo = self.widget.column_combo
        index = combo.model().indexOf(var)
        combo.setCurrentIndex(index)
        # setCurrentIndex alone does not emit `activated`
        combo.activated.emit(index)

    def set_checked(self, checked):
        """Click the transformation checkbox if it is not in state `checked`."""
        cb = self.widget.cb_transformation
        if cb.isChecked() != checked:
            cb.click()

    def test_model(self):
        """Combo content, outputs and errors follow the input data."""
        w = self.widget
        model = w.column_combo.model()

        def assert_has_outputs():
            self.assertIsNotNone(self.get_output(w.Outputs.model))
            self.assertIsNotNone(self.get_output(w.Outputs.learner))

        def assert_no_outputs():
            self.assertIsNone(self.get_output(w.Outputs.model))
            self.assertIsNone(self.get_output(w.Outputs.learner))

        # Multinomial target
        self.send_signal(self.class_data)
        self.assertEqual(list(model), [self.disc_b, self.disc_d, self.disc_de])
        assert_has_outputs()

        # No data
        self.send_signal(None)
        assert_no_outputs()
        self.assertEqual(list(model), [])

        # Binary target
        self.send_signal(self.bin_data)
        self.assertEqual(list(model), [self.disc_d, self.cont_e, self.cont_f])
        assert_has_outputs()

        # Numeric target
        self.send_signal(self.regr_data)
        self.assertEqual(list(model), [self.cont_e, self.cont_f])
        assert_has_outputs()

        # Data without a class variable
        self.send_signal(
            self.bin_data.transform(Domain(self.bin_data.domain.attributes)))
        assert_no_outputs()
        self.assertTrue(w.Error.no_class.is_shown())
        self.assertFalse(w.Error.no_variables.is_shown())

        # Valid data again: errors are cleared
        self.send_signal(self.regr_data)
        assert_has_outputs()
        self.assertFalse(w.Error.no_class.is_shown())
        self.assertFalse(w.Error.no_variables.is_shown())

        # Numeric target with discrete attributes only
        self.send_signal(
            self.regr_data.transform(
                Domain([self.disc_b, self.disc_c, self.disc_d], self.cont_g)))
        assert_no_outputs()
        self.assertFalse(w.Error.no_class.is_shown())
        self.assertTrue(w.Error.no_variables.is_shown())

    def test_combo_hint(self):
        """The chosen column is kept across inputs whenever it is available."""
        self.send_signal(self.bin_data)
        self.assertEqual(self.widget.column, self.disc_d)

        self.send_signal(self.regr_data)
        self.assertEqual(self.widget.column, self.cont_e)

        self.set_column(self.cont_f)
        # Keep f, because it exists
        self.send_signal(self.bin_data)
        self.assertEqual(self.widget.column, self.cont_f)

        self.set_column(self.disc_d)
        # Can't keep d: it is not offered for regr_data
        self.send_signal(self.regr_data)
        self.assertEqual(self.widget.column, self.cont_e)

        # Keep hint when there is no data
        self.set_column(self.cont_f)
        self.send_signal(None)
        self.send_signal(self.regr_data)
        self.assertEqual(self.widget.column, self.cont_f)

    def test_update_transform_checkbox(self):
        """Checkbox enabled/checked state follows the target and the column."""
        check = self.widget.cb_transformation
        # No data: button is enabled
        self.assertTrue(check.isEnabled())

        # Check the checkbox so that we see it behaves properly
        # when disabled, unchecked and re-enabled
        self.set_checked(True)

        with self.subTest("Multinomial target"):
            self.send_signal(self.class_data)
            # Discrete target: button is disabled and unchecked,
            # transformation not applied
            self.assertFalse(check.isEnabled())
            self.assertFalse(check.isChecked())
            self.assertFalse(self.widget.apply_transformation)

            # No data: re-enabled and re-checked
            self.send_signal(None)
            self.assertTrue(check.isEnabled())
            self.assertTrue(check.isChecked())

            self.send_signal(self.bin_data)
            # Binary target, discrete column: button is disabled and
            # unchecked, transformation not applied
            self.assertIs(self.widget.column, self.disc_d)
            self.assertFalse(check.isEnabled())
            self.assertFalse(check.isChecked())
            self.assertFalse(self.widget.apply_transformation)

        with self.subTest("Binary target, numeric column within range"):
            # Binary target, numeric column: enabled and checked (because
            # of the setting).  Selecting the same column again must leave
            # that state untouched.
            # NOTE(review): earlier comments here described a numeric column
            # outside the [0, 1] range, but only `e` was ever selected --
            # confirm whether that scenario should be covered.
            for _ in range(3):
                self.set_column(self.cont_e)
                self.assertTrue(check.isEnabled())
                self.assertTrue(check.isChecked())
                self.assertTrue(self.widget.apply_transformation)

            # Switch the setting off while there is no data; it must stay
            # off when the data and the numeric column come back
            self.send_signal(None)
            self.set_checked(False)
            self.send_signal(self.bin_data)
            self.set_column(self.cont_e)
            self.assertTrue(check.isEnabled())
            self.assertFalse(check.isChecked())
            self.assertFalse(self.widget.apply_transformation)

        with self.subTest("Regression target"):
            self.send_signal(self.regr_data)
            # Regression target: button is enabled; it is still unchecked
            # because the setting was switched off above
            self.assertTrue(check.isEnabled())
            self.assertFalse(check.isChecked())
            self.assertFalse(self.widget.apply_transformation)

            self.set_checked(True)
            self.assertTrue(self.widget.apply_transformation)

            # Multinomial target: disabled and unchecked ...
            self.send_signal(self.class_data)
            self.assertFalse(check.isEnabled())
            self.assertFalse(check.isChecked())

            # ... and restored when the regression data returns
            self.send_signal(self.regr_data)
            self.assertTrue(check.isEnabled())
            self.assertTrue(check.isChecked())
            self.assertTrue(self.widget.apply_transformation)

            # Column that would be out of range for discrete,
            # but regression doesn't mind
            self.set_checked(False)
            self.set_column(self.cont_f)
            self.assertTrue(check.isEnabled())
            self.assertFalse(check.isChecked())
            self.assertFalse(self.widget.apply_transformation)

    def test_checkbox_text(self):
        """Checkbox text and tooltip name the function matching the target."""
        check = self.widget.cb_transformation
        self.send_signal(self.bin_data)
        self.assertIn("logistic", check.text())
        self.assertIn("logistic", check.toolTip())
        self.send_signal(self.regr_data)
        self.assertIn("linear", check.text())
        self.assertIn("linear", check.toolTip())
        self.send_signal(self.class_data)
        self.assertIn("logistic", check.text())
        self.assertIn("logistic", check.toolTip())

    @patch("Orange.widgets.evaluate.owfeatureaspredictor.ColumnLearner")
    def test_commit(self, learner):
        """The learner gets (class_var, column, flag); outputs come from it."""
        model = self.widget.column_combo.model()
        class_var = self.regr_data.domain.class_var

        def assert_outputs_from_learner():
            self.assertIs(
                self.get_output(self.widget.Outputs.learner),
                learner.return_value)
            self.assertIs(
                self.get_output(self.widget.Outputs.model),
                learner.return_value.return_value)

        self.assertIsNone(self.get_output(self.widget.Outputs.model))
        self.assertIsNone(self.get_output(self.widget.Outputs.learner))
        self.set_checked(False)

        # New data: first column, no transformation
        learner.reset_mock()
        self.send_signal(self.regr_data)
        self.assertFalse(self.widget.apply_transformation)
        learner.assert_called_once_with(class_var, model[0], False)
        assert_outputs_from_learner()

        # Another column
        learner.reset_mock()
        self.set_column(model[1])
        self.assertFalse(self.widget.apply_transformation)
        learner.assert_called_once_with(class_var, model[1], False)
        assert_outputs_from_learner()

        # Transformation switched on
        learner.reset_mock()
        self.set_checked(True)
        learner.assert_called_once_with(class_var, model[1], True)
        assert_outputs_from_learner()

    @patch("Orange.modelling.column.ColumnModel")
    def test_report_data(self, model):
        """Report lists the column, the transformation and its parameters."""
        items = self.widget.report_items = Mock()
        model.return_value.intercept = 1
        model.return_value.coefficient = 2

        def assert_report(column_name, *expected):
            # Produce a report and check the value of its first row (the
            # column name) and the values of all the rows after it
            self.widget.send_report()
            rows = items.call_args[0][0]
            self.assertEqual(rows[0][1], column_name)
            self.assertEqual(tuple(row[1] for row in rows[1:]), expected)

        # No data: nothing is reported
        self.widget.send_report()
        items.assert_not_called()

        self.send_signal(self.class_data)
        self.set_column(self.disc_d)
        assert_report("d", False, False, False)

        self.set_column(self.disc_b)
        assert_report("b", False, False, False)

        self.send_signal(self.regr_data)
        self.set_checked(False)
        assert_report("e", False, False, False)

        self.set_checked(True)
        assert_report("e", "linear", 1, 2)

        self.send_signal(self.bin_data)
        self.set_column(self.disc_d)
        assert_report("d", False, False, False)

        self.set_column(self.cont_e)
        self.assertTrue(self.widget.apply_transformation)
        assert_report("e", "logistic", 1, 2)
+ ColumnLearner: false + ColumnModel: false + def `_check_column_combinations`: + Regression can only be used with numeric variables: false + Numeric columns can only be used with binary class variables: false + Column contains values that are not in class variable: false + 'Intercept and coefficient are only allowed for continuous ': false + variables: false + class `ColumnLearner`: + def `__init__`: + column '{column.name}': false + def `fit_storage`: + Class variable does not match the data: false + class `ColumnModel`: + def `__init__`: + Intercept and coefficient must both be provided or absent: false + ' ({intercept}, {coefficient})': false + column '{column.name}'{pars}: stolpec '{column.name}'{pars} + def `_predict_discrete`: + 'Column values must be in [0, 1] range ': false + unless logistic function is applied: false + def `__str__`: + ' ({self.intercept}, {self.coefficient})': false + ColumnModel {self.column.name}{pars}: false modelling/constant.py: ConstantLearner: false class `ConstantLearner`: @@ -8900,6 +8925,36 @@ widgets/evaluate/owconfusionmatrix.py: selected_learner: false __main__: false iris: false +widgets/evaluate/owfeatureaspredictor.py: + class `OWFeatureAsPredictor`: + Feature as Predictor: Stolpec kot napoved + Use a column as probabilities or predictions: Uporabi stolpec kot verjetnosti ali napovedi + icons/FeatureAsPredictor.svg: false + column predictor: true + class `Inputs`: + Data: Podatki + class `Outputs`: + Learner: Učni algoritem + Model: Model + class `Error`: + Data has no target variable.: Podatki nimajo ciljne spremenljivke. + No useful variables: Ni uporabnih spremenljivk. 
+ def `__init__`: + auto_apply: false + def `_update_controls`: + logistic: logistično + linear: linearno + Transform through {shape} function: Pretvori z {shape} funkcijo + Use {shape} regression to fit the model's coefficients: Uporabi {shape} regresijo za nastavitev koeficientov modela + def `send_report`: + Predict values from: Napoved vrednosti iz + Applied transformation: Uporabljena transformacija + logistic: logistična + linear: linearna + Intercept: Presečišče + Coefficient: Koeficient + __main__: false + heart_disease: false widgets/evaluate/owliftcurve.py: CurveData: false contacted: false