adding environment variables for N_JOBS, and adding a default classifier (sklearn's logistic regression) for when the classifier is not specified in aggregative quantifiers
HLT-ISTI · May 30, 2024 · ad11b86 · ad11b86
1 parent 9ad36ef
commit ad11b86
Show file tree

Hide file tree

Showing 9 changed files with 108 additions and 77 deletions.
diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt
@@ -1,10 +1,26 @@
 Change Log 0.1.9
 ----------------
 - [TODO] add LeQua2024
-- [TODO] add njobs to env
-- [TODO] add basic examples
-- [TODO] add default classifier to env
-- [TODO] add default classifier to env
+
+- Added a default classifier for aggregative quantifiers, which now can be instantiated without specifying
+    the classifier. The default classifier can be accessed in qp.environ['DEFAULT_CLS'] and is assigned to
+    sklearn.linear_model.LogisticRegression(max_iter=3000). If the classifier is not specified, then a clone
+    of said classifier is returned. E.g.:
+    > pacc = PACC()
+    is equivalent to:
+    > pacc = PACC(classifier=LogisticRegression(max_iter=3000))
+
+- Improved error logging in model selection. In v0.1.8 only Status.INVALID was reported; in v0.1.9 it is
+    now accompanied by a textual description of the error
+
+- The number of parallel workers can now be set via an environment variable by running, e.g.:
+    > N_JOBS=10 python3 your_script.py
+    which has the same effect as writing the following code at the beginning of your_script.py:
+    > import quapy as qp
+    > qp.environ["N_JOBS"] = 10
+
+- Some examples have been added to the ./examples/ dir, which now contains numbered examples from basics (0)
+    to advanced topics (higher numbers)
 
 - Moved the wiki documents to the ./docs/ folder so that they become editable via PR for the community
 

diff --git a/examples/0.basics.py b/examples/0.basics.py
@@ -33,9 +33,10 @@
 print(f'training prevalence = {F.strprev(train.prevalence())}')
 
 # let us train one quantifier, for example, PACC, using sklearn's logistic regression as the underlying classifier
-classifier = LogisticRegression()
+# classifier = LogisticRegression()
 
-pacc = qp.method.aggregative.PACC(classifier)
+# pacc = qp.method.aggregative.PACC(classifier)
+pacc = qp.method.aggregative.PACC()
 
 print(f'training {pacc}')
 pacc.fit(train)

diff --git a/examples/1.model_selection.py b/examples/1.model_selection.py
@@ -1,21 +1,23 @@
 import quapy as qp
-from method._kdey import KDEyML
-from quapy.method.non_aggregative import DMx
-from quapy.protocol import APP, UPP
+from quapy.protocol import UPP
 from quapy.method.aggregative import DMy
 from sklearn.linear_model import LogisticRegression
-from examples.comparing_gridsearch import OLD_GridSearchQ
 import numpy as np
 from time import time
 
 """
 In this example, we show how to perform model selection on a DistributionMatching quantifier.
 """
 
-model = DMy(LogisticRegression())
+model = DMy()
 
 qp.environ['SAMPLE_SIZE'] = 100
-qp.environ['N_JOBS'] = -1
+
+print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
+      f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n'
+      f'alternatively, you can set this variable within the script as:\n'
+      f'import quapy as qp\n'
+      f'qp.environ["N_JOBS"]=-1')
 
 training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test
 
@@ -42,7 +44,7 @@
     # different configurations of the quantifier. In other words, quapy avoids training
     # the classifier 7x7 times.
     param_grid = {
-        'classifier__C': np.logspace(-3,3,7),
+        'classifier__C': np.logspace(-3, 3, 7),
         'nbins': [2, 3, 4, 5, 10, 15, 20]
     }
 

diff --git a/examples/8.ucimulti_experiments.py b/examples/8.ucimulti_experiments.py
@@ -7,7 +7,7 @@
 from sklearn.linear_model import LogisticRegression
 
 import quapy as qp
-from quapy.method.aggregative import PACC, EMQ, KDEyML
+from quapy.method.aggregative import PACC, EMQ
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 from pathlib import Path
@@ -52,6 +52,7 @@ def load_timings(result_path):
     df = pd.read_csv(result_path+'.csv', sep='\t')
     return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict()
 
+
 if __name__ == '__main__':
 
     qp.environ['SAMPLE_SIZE'] = 500

diff --git a/quapy/__init__.py b/quapy/__init__.py
@@ -1,15 +1,18 @@
 """QuaPy module for quantification"""
+from sklearn.linear_model import LogisticRegression
+
 from quapy.data import datasets
 from . import error
 from . import data
 from . import functional
-# from . import method
+from . import method
 from . import evaluation
 from . import protocol
 from . import plot
 from . import util
 from . import model_selection
 from . import classification
+import os
 
 __version__ = '0.1.9'
 
@@ -20,7 +23,8 @@
     'PAD_TOKEN': '[PAD]',
     'PAD_INDEX': 1,
     'SVMPERF_HOME': './svm_perf_quantification',
-    'N_JOBS': 1
+    'N_JOBS': int(os.getenv('N_JOBS', 1)),
+    'DEFAULT_CLS': LogisticRegression(max_iter=3000)
 }
 
 
@@ -48,3 +52,19 @@ def _get_sample_size(sample_size):
     if sample_size is None:
         raise ValueError('neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')
     return sample_size
+
+
+def _get_classifier(classifier):
+    """
+    Resolves the classifier to be used when one may not have been specified.
+
+    If `classifier` is not None, it is returned as is. If it is None, a clone of
+    `environ['DEFAULT_CLS']` is returned instead (the default instance itself is
+    never handed out).
+
+    :param classifier: sklearn's estimator or None
+    :return: sklearn's estimator
+    :raises ValueError: if `classifier` is None and `environ['DEFAULT_CLS']` is missing or None
+    """
+    if classifier is not None:
+        return classifier
+    # validate the default *before* cloning: sklearn's clone raises a TypeError when given
+    # None, which would otherwise make the explicit ValueError below unreachable
+    default_classifier = environ.get('DEFAULT_CLS')
+    if default_classifier is None:
+        raise ValueError('neither classifier nor qp.environ["DEFAULT_CLS"] have been specified')
+    from sklearn.base import clone
+    return clone(default_classifier)
diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py
@@ -24,12 +24,14 @@ def _check_bandwidth(cls, bandwidth):
         Checks that the bandwidth parameter is correct
 
         :param bandwidth: either a string (see BANDWIDTH_METHOD) or a float
-        :return: nothing, but raises an exception for invalid values
+        :return: the bandwidth if the check is passed, or raises an exception for invalid values
         """
         assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
             f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
         if isinstance(bandwidth, float):
-            assert 0 < bandwidth < 1,  "the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
+            assert 0 < bandwidth < 1,  \
+                "the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
+        return bandwidth
 
     def get_kde_function(self, X, bandwidth):
         """
@@ -106,16 +108,13 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
         Alternatively, this set can be specified at fit time by indicating the exact set of data
         on which the predictions are to be generated.
     :param bandwidth: float, the bandwidth of the Kernel
-    :param n_jobs: number of parallel workers
     :param random_state: a seed to be set before fitting any base quantifier (default None)
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=None):
-        self._check_bandwidth(bandwidth)
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
-        self.bandwidth = bandwidth
-        self.n_jobs = n_jobs
+        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
         self.random_state=random_state
 
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
@@ -130,17 +129,17 @@ def aggregate(self, posteriors: np.ndarray):
         :param posteriors: instances in the sample converted into posterior probabilities
         :return: a vector of class prevalence estimates
         """
-        np.random.RandomState(self.random_state)
-        epsilon = 1e-10
-        n_classes = len(self.mix_densities)
-        test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
+        with qp.util.temp_seed(self.random_state):
+            epsilon = 1e-10
+            n_classes = len(self.mix_densities)
+            test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
 
-        def neg_loglikelihood(prev):
-            test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
-            test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
-            return  -np.sum(test_loglikelihood)
+            def neg_loglikelihood(prev):
+                test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
+                test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
+                return  -np.sum(test_loglikelihood)
 
-        return F.optim_minimize(neg_loglikelihood, n_classes)
+            return F.optim_minimize(neg_loglikelihood, n_classes)
 
 
 class KDEyHD(AggregativeSoftQuantifier, KDEBase):
@@ -183,20 +182,17 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
         Alternatively, this set can be specified at fit time by indicating the exact set of data
         on which the predictions are to be generated.
     :param bandwidth: float, the bandwidth of the Kernel
-    :param n_jobs: number of parallel workers
     :param random_state: a seed to be set before fitting any base quantifier (default None)
     :param montecarlo_trials: number of Monte Carlo trials (default 10000)
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD',
-                 bandwidth=0.1, n_jobs=None, random_state=None, montecarlo_trials=10000):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD',
+                 bandwidth=0.1, random_state=None, montecarlo_trials=10000):
 
-        self._check_bandwidth(bandwidth)
-        self.classifier = classifier
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.divergence = divergence
-        self.bandwidth = bandwidth
-        self.n_jobs = n_jobs
+        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
         self.random_state=random_state
         self.montecarlo_trials = montecarlo_trials
 
@@ -278,15 +274,12 @@ class KDEyCS(AggregativeSoftQuantifier):
         Alternatively, this set can be specified at fit time by indicating the exact set of data
         on which the predictions are to be generated.
     :param bandwidth: float, the bandwidth of the Kernel
-    :param n_jobs: number of parallel workers
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None):
-        KDEBase._check_bandwidth(bandwidth)
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
-        self.bandwidth = bandwidth
-        self.n_jobs = n_jobs
+        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
 
     def gram_matrix_mix_sum(self, X, Y=None):
         # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
@@ -355,7 +348,7 @@ def divergence(alpha):
             # called \overline{r} in the paper
             alpha_ratio = alpha * self.counts_inv
 
-            # recal that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
+            # recall that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
             partA = -np.log((alpha_ratio @ tr_te_sums) * Minv)
             partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio)
             return partA + partB #+ partC

diff --git a/quapy/method/_threshold_optim.py b/quapy/method/_threshold_optim.py
@@ -27,8 +27,8 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
         :class:`quapy.data.base.LabelledCollection` (the split itself).
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=None, n_jobs=None):
-        self.classifier = classifier
+    def __init__(self, classifier: BaseEstimator=None, val_split=None, n_jobs=None):
+        self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
         self.n_jobs = qp._get_njobs(n_jobs)
 
@@ -143,7 +143,7 @@ class T50(ThresholdOptimization):
         :class:`quapy.data.base.LabelledCollection` (the split itself).
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)
 
     def condition(self, tpr, fpr) -> float:
@@ -167,7 +167,7 @@ class MAX(ThresholdOptimization):
         :class:`quapy.data.base.LabelledCollection` (the split itself).
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)
 
     def condition(self, tpr, fpr) -> float:
@@ -192,7 +192,7 @@ class X(ThresholdOptimization):
         :class:`quapy.data.base.LabelledCollection` (the split itself).
     """
 
-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)
 
     def condition(self, tpr, fpr) -> float:
@@ -215,7 +215,7 @@ class prevalence estimates for all decision thresholds and returns the median of
         `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
         :class:`quapy.data.base.LabelledCollection` (the split itself).
     """
-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)
 
     def condition(self, tpr, fpr) -> float:
@@ -254,7 +254,7 @@ class prevalence estimates for all decision thresholds and returns the median of
         `k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
         :class:`quapy.data.base.LabelledCollection` (the split itself).
     """
-    def __init__(self, classifier: BaseEstimator, val_split=5):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5):
         super().__init__(classifier, val_split)
 
     def discard(self, tpr, fpr) -> bool: