Skip to content

Commit

Permalink
Column As Model: Better interpretation of column values
Browse files Browse the repository at this point in the history
  • Loading branch information
janezd committed Jul 14, 2024
1 parent 2ee388f commit 73f6125
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 44 deletions.
75 changes: 59 additions & 16 deletions Orange/classification/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,42 +3,85 @@
import numpy as np

from Orange.data import Variable, DiscreteVariable, Domain
from Orange.classification import Model
from Orange.classification import Model, Learner


# Public API of the module: the model and the learner that produces it.
__all__ = ["ColumnClassifier", "ColumnLearner"]


class ColumnLearner(Learner):
    """Learner whose "model" is an existing column of the data.

    The learner only stores the settings needed to construct a
    ``ColumnClassifier``; the data passed to ``fit_storage`` is not used.
    """

    def __init__(self,
                 class_var: DiscreteVariable,
                 column: Variable,
                 offset: Optional[float] = None,
                 k: Optional[float] = None):
        """Remember the target variable, the source column and the optional
        logistic-function parameters (``offset`` and ``k``)."""
        super().__init__()
        self.class_var, self.column = class_var, column
        self.offset, self.k = offset, k
        self.name = f"column '{column.name}'"

    def fit_storage(self, _):
        """Return a ``ColumnClassifier`` built from the stored settings.

        The argument (training data) is ignored.
        """
        model = ColumnClassifier(
            self.class_var, self.column, self.offset, self.k)
        return model


class ColumnClassifier(Model):
def __init__(self,
             class_var: DiscreteVariable,
             column: Variable,
             offset: Optional[float] = None,
             k: Optional[float] = None):
    """Create a model that predicts `class_var` from the values of `column`.

    Args:
        class_var: discrete target variable.
        column: variable whose values are used as predictions. A continuous
            column requires a binary `class_var`; a discrete column may only
            contain values that also appear in `class_var`.
        offset: offset of the logistic function
            ``1 / (1 + exp(-k * (x - offset)))``; continuous columns only.
        k: slope of the logistic function; when None, column values are used
            as probabilities directly.

    Raises:
        ValueError: a discrete column has values that are not values of
            `class_var`.
    """
    super().__init__(Domain([column], class_var))
    assert class_var.is_discrete
    if column.is_continuous:
        assert len(class_var.values) == 2
        # Binary target: column values refer to class indices 0 and 1 as is.
        self.value_mapping = np.array([0, 1])
    else:
        assert column.is_discrete
        # The logistic function is meaningless for discrete columns.
        assert offset is None and k is None
        if not self.check_value_sets(class_var, column):
            raise ValueError(
                "Column contains values that are not in class variable")
        # Index i of the column's values -> index of the same value
        # in the class variable.
        self.value_mapping = np.array(
            [class_var.to_val(x) for x in column.values])
    self.class_var = class_var
    self.column = column
    self.offset = offset
    self.k = k
    self.name = f"column '{column.name}'"

@staticmethod
def check_prob_range(values: np.ndarray):
return np.nanmin(values) >= 0 and np.nanmax(values) <= 1

@staticmethod
def check_value_sets(class_var: DiscreteVariable,
column_var: DiscreteVariable):
return set(column_var.values) <= set(class_var.values)

def predict_storage(self, data):
vals = data.get_column(self.column)
rows = np.isfinite(vals)
if self.column.is_discrete:
proba = np.zeros((len(data), len(self.column.values)))
rows = np.isfinite(vals)
proba[rows, vals[rows].astype(int)] = 1
proba = np.zeros((len(data), len(self.class_var.values)))
vals = self.value_mapping[vals[rows].astype(int)]
proba[rows, vals] = 1
else:
proba = np.zeros((len(data), 2))
proba = np.full((len(data), len(self.class_var.values)), 0.5)
if self.k is None:
if np.nanmin(vals) < 0 or np.nanmax(vals) > 1:
if not self.check_prob_range(vals):
raise ValueError("Column values must be in [0, 1] range "
"unless logistic function is applied")
proba[:, 1] = vals
proba[:, 0] = 1 - vals
proba[rows, 1] = vals[rows]
proba[rows, 0] = 1 - vals[rows]
vals = vals > 0.5
else:
proba[:, 1] = 1 / (1 + np.exp(-self.k * vals))
proba[:, 0] = 1 - proba[:, 1]
vals = vals > 0
proba[rows, 1] = (
1 / (1 + np.exp(-self.k * (vals[rows] - self.offset))))
proba[rows, 0] = 1 - proba[:, 1]
vals = vals > self.offset
return vals, proba

def __str__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,18 @@
from AnyQt.QtGui import QDoubleValidator

from orangewidget import gui
from orangewidget.settings import ContextSetting
from orangewidget.widget import Msg

from Orange.classification.column import ColumnClassifier
from Orange.classification.column import ColumnClassifier, ColumnLearner
from Orange.data import Table
from Orange.widgets.widget import OWWidget, Input, Output
from Orange.widgets.utils.itemmodels import DomainModel, VariableListModel
from Orange.widgets.utils.owlearnerwidget import OWBaseLearner
from Orange.widgets.utils.itemmodels import VariableListModel
from Orange.widgets.utils.widgetpreview import WidgetPreview
from orangewidget.gui import deferred
from orangewidget.settings import ContextSetting
from orangewidget.widget import Msg


class OWColumn(OWWidget):
name = "By Column"
name = "Column as Model"
description = "Predict values from columns."
icon = "icons/ByColumn.svg"
priority = 10
Expand All @@ -30,36 +28,56 @@ class Inputs:
data = Input("Data", Table)

class Outputs:
learner = Output("Learner", ColumnLearner)
model = Output("Model", ColumnClassifier)

class Error(OWWidget.Error):
no_class = Msg("Data has no class variable.")
no_variables = Msg("No useful variables.")
no_variables = Msg("No useful variables")
invalid_probabilities = \
Msg("Values must be between 0 and 1 (unless using logistic function).")

column = ContextSetting(None)
offset = ContextSetting(0)
k = ContextSetting(1)
apply_logistic = ContextSetting(False)
apply_logistic = ContextSetting(1)
auto_apply = ContextSetting(True)

def __init__(self):
super().__init__()
self.data = None

self.column_model = VariableListModel()
box = gui.vBox(self.controlArea, True)
gui.comboBox(
self.controlArea, self, "column", box="Column",
box, self, "column",
label="Column:", orientation=Qt.Horizontal,
model=self.column_model,
callback=self.on_column_changed)
self.options = gui.vBox(self.controlArea, "Options")
gui.checkBox(
self.options, self, "apply_logistic", "Apply logistic function",
self.options = gui.vBox(box)
self.bg = gui.radioButtons(
self.options, self, "apply_logistic",
["Use values as probabilities"],
tooltips=["For this, values must be betwwen 0 and 1."],
callback=self.on_apply_logistic_changed)
le = gui.lineEdit(
gui.indentedBox(self.options),
self, "k", label="k", orientation=Qt.Horizontal,
ibox = gui.hBox(self.options)
ibox.layout().setSpacing(0)
gui.appendRadioButton(
self.bg, "Apply logistic function with", insertInto=ibox,
tooltip="1 / [1 + exp(-k * (x - offset))]")
gui.lineEdit(
ibox,
self, "offset", label="offset", orientation=Qt.Horizontal,
valueType=float, validator=QDoubleValidator(),
alignment=Qt.AlignRight, controlWidth=40,
callback=self.on_log_pars_changed)
gui.lineEdit(
ibox,
self, "k", label=", k", orientation=Qt.Horizontal,
valueType=float, validator=QDoubleValidator(bottom=0),
alignment=Qt.AlignRight, controlWidth=80,
callback=self.on_k_changed)
gui.rubber(le.box)
alignment=Qt.AlignRight, controlWidth=40,
callback=self.on_log_pars_changed)
gui.rubber(ibox)
gui.auto_apply(self.controlArea, self, "auto_apply")
self._update_controls()

Expand All @@ -71,12 +89,12 @@ def on_apply_logistic_changed(self):
self._update_controls()
self.commit.deferred()

def on_log_pars_changed(self):
    """Callback for edits of the logistic function parameters."""
    # Changing a parameter of the logistic function implies that the user
    # wants to use it, so switch `apply_logistic` on before committing.
    self.apply_logistic = 1
    self.commit.deferred()

def _update_controls(self):
    """Disable the options box when no column is selected or when the
    selected column is discrete."""
    self.options.setDisabled(self.column is None or self.column.is_discrete)

@Inputs.data
def set_data(self, data):
Expand All @@ -89,12 +107,14 @@ def set_data(self, data):
self.data = None
self.column_model.clear()
else:
nvalues = len(class_var.values)
classes = set(class_var.values)
binary_class = len(classes) == 2
self.column_model[:] = (
var
for var in chain(data.domain.attributes, data.domain.metas)
if var.is_continuous and nvalues == 2
or var.is_discrete and len(var.values) == nvalues)
if (var.is_discrete
and ColumnClassifier.check_value_sets(class_var, var))
or (var.is_continuous and binary_class))
if not self.column_model:
self.Error.no_variables()
self.data = None
Expand All @@ -107,14 +127,28 @@ def set_data(self, data):
self._update_controls()
self.commit.now()

@gui.deferred
def commit(self):
    """Send the learner and the model for the currently selected column.

    Without a selected column both outputs are cleared. If a continuous
    column is used directly as probabilities but has values outside
    [0, 1], an error is shown and no model is sent; the learner is sent
    regardless.
    """
    self.Error.invalid_probabilities.clear()
    if self.column is None:
        self.Outputs.learner.send(None)
        self.Outputs.model.send(None)
        return

    is_continuous = self.column.is_continuous
    apply_logistic = is_continuous and self.apply_logistic
    learner = ColumnLearner(
        self.data.domain.class_var, self.column,
        self.offset if apply_logistic else None,
        self.k if apply_logistic else None)

    # The [0, 1] requirement applies only to continuous columns that are
    # used as probabilities. Discrete columns are predicted by mapping
    # their values to class values, so their stored values must not be
    # range-checked.
    needs_range_check = is_continuous and not apply_logistic
    if needs_range_check and not ColumnClassifier.check_prob_range(
            self.data.get_column(self.column)):
        self.Error.invalid_probabilities()
        model = None
    else:
        model = learner(self.data)

    self.Outputs.learner.send(learner)
    self.Outputs.model.send(model)


Expand Down

0 comments on commit 73f6125

Please sign in to comment.