Skip to content

Commit

Permalink
adding environment variables for N_JOBS, and adding a default classif…
Browse files Browse the repository at this point in the history
…ier (sklearn's logistic regression) for when the classifier is not specified in aggregative quantifiers
  • Loading branch information
AlexMoreo committed May 30, 2024
1 parent 9ad36ef commit ad11b86
Show file tree
Hide file tree
Showing 9 changed files with 108 additions and 77 deletions.
24 changes: 20 additions & 4 deletions CHANGE_LOG.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,26 @@
Change Log 0.1.9
----------------
- [TODO] add LeQua2024
- [TODO] add njobs to env
- [TODO] add basic examples
- [TODO] add default classifier to env
- [TODO] add default classifier to env

- Added a default classifier for aggregative quantifiers, which now can be instantiated without specifying
the classifier. The default classifier can be accessed in qp.environ['DEFAULT_CLS'] and is assigned to
sklearn.linear_model.LogisticRegression(max_iter=3000). If the classifier is not specified, then a clone
of said classifier is returned. E.g.:
> pacc = PACC()
is equivalent to:
> pacc = PACC(classifier=LogisticRegression(max_iter=3000))

- Improved error logging in model selection. In v0.1.8 only Status.INVALID was reported; in v0.1.9 it is
now accompanied by a textual description of the error

- The number of parallel workers can now be set via an environment variable by running, e.g.:
> N_JOBS=10 python3 your_script.py
which has the same effect as writing the following code at the beginning of your_script.py:
> import quapy as qp
> qp.environ["N_JOBS"] = 10

- Some examples have been added to the ./examples/ dir, which now contains numbered examples from basics (0)
to advanced topics (higher numbers)

- Moved the wiki documents to the ./docs/ folder so that they become editable via PR for the community

Expand Down
5 changes: 3 additions & 2 deletions examples/0.basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,10 @@
print(f'training prevalence = {F.strprev(train.prevalence())}')

# let us train one quantifier, for example PACC, using sklearn's logistic regression as the underlying classifier
classifier = LogisticRegression()
# classifier = LogisticRegression()

pacc = qp.method.aggregative.PACC(classifier)
# pacc = qp.method.aggregative.PACC(classifier)
pacc = qp.method.aggregative.PACC()

print(f'training {pacc}')
pacc.fit(train)
Expand Down
16 changes: 9 additions & 7 deletions examples/1.model_selection.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,23 @@
import quapy as qp
from method._kdey import KDEyML
from quapy.method.non_aggregative import DMx
from quapy.protocol import APP, UPP
from quapy.protocol import UPP
from quapy.method.aggregative import DMy
from sklearn.linear_model import LogisticRegression
from examples.comparing_gridsearch import OLD_GridSearchQ
import numpy as np
from time import time

"""
In this example, we show how to perform model selection on a DistributionMatching quantifier.
"""

model = DMy(LogisticRegression())
model = DMy()

qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1

print(f'running model selection with N_JOBS={qp.environ["N_JOBS"]}; '
f'to increase the number of jobs use:\n> N_JOBS=-1 python3 1.model_selection.py\n'
f'alternatively, you can set this variable within the script as:\n'
f'import quapy as qp\n'
f'qp.environ["N_JOBS"]=-1')

training, test = qp.datasets.fetch_UCIMulticlassDataset('letter').train_test

Expand All @@ -42,7 +44,7 @@
# different configurations of the quantifier. In other words, quapy avoids training
# the classifier 7x7 times.
param_grid = {
'classifier__C': np.logspace(-3,3,7),
'classifier__C': np.logspace(-3, 3, 7),
'nbins': [2, 3, 4, 5, 10, 15, 20]
}

Expand Down
3 changes: 2 additions & 1 deletion examples/8.ucimulti_experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from sklearn.linear_model import LogisticRegression

import quapy as qp
from quapy.method.aggregative import PACC, EMQ, KDEyML
from quapy.method.aggregative import PACC, EMQ
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from pathlib import Path
Expand Down Expand Up @@ -52,6 +52,7 @@ def load_timings(result_path):
df = pd.read_csv(result_path+'.csv', sep='\t')
return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict()


if __name__ == '__main__':

qp.environ['SAMPLE_SIZE'] = 500
Expand Down
24 changes: 22 additions & 2 deletions quapy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
"""QuaPy module for quantification"""
from sklearn.linear_model import LogisticRegression

from quapy.data import datasets
from . import error
from . import data
from . import functional
# from . import method
from . import method
from . import evaluation
from . import protocol
from . import plot
from . import util
from . import model_selection
from . import classification
import os

__version__ = '0.1.9'

Expand All @@ -20,7 +23,8 @@
'PAD_TOKEN': '[PAD]',
'PAD_INDEX': 1,
'SVMPERF_HOME': './svm_perf_quantification',
'N_JOBS': 1
'N_JOBS': int(os.getenv('N_JOBS', 1)),
'DEFAULT_CLS': LogisticRegression(max_iter=3000)
}


Expand Down Expand Up @@ -48,3 +52,19 @@ def _get_sample_size(sample_size):
if sample_size is None:
raise ValueError('neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')
return sample_size


def _get_classifier(classifier):
    """
    Resolves the classifier to be used when one may not have been specified.

    If `classifier` is not None, it is returned unchanged. Otherwise, a clone of
    `environ['DEFAULT_CLS']` is returned (a new estimator object, not the instance
    stored in the environment).

    :param classifier: sklearn's estimator or None
    :return: sklearn's estimator; either `classifier` itself or a clone of `environ['DEFAULT_CLS']`
    :raises ValueError: if `classifier` is None and `environ['DEFAULT_CLS']` is missing or None
    """
    if classifier is not None:
        return classifier

    default_classifier = environ.get('DEFAULT_CLS')
    if default_classifier is None:
        # This check must precede the cloning: sklearn's clone rejects None with an error of
        # its own, which would hide the more informative message below.
        raise ValueError('neither classifier nor qp.environ["DEFAULT_CLS"] have been specified')

    # imported at call time, only when the default classifier is actually needed
    from sklearn.base import clone
    return clone(default_classifier)
55 changes: 24 additions & 31 deletions quapy/method/_kdey.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,14 @@ def _check_bandwidth(cls, bandwidth):
Checks that the bandwidth parameter is correct
:param bandwidth: either a string (see BANDWIDTH_METHOD) or a float
:return: nothing, but raises an exception for invalid values
:return: the bandwidth if the check is passed, or raises an exception for invalid values
"""
assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
if isinstance(bandwidth, float):
assert 0 < bandwidth < 1, "the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
assert 0 < bandwidth < 1, \
"the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
return bandwidth

def get_kde_function(self, X, bandwidth):
"""
Expand Down Expand Up @@ -106,16 +108,13 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated.
:param bandwidth: float, the bandwidth of the Kernel
:param n_jobs: number of parallel workers
:param random_state: a seed to be set before fitting any base quantifier (default None)
"""

def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=None):
self._check_bandwidth(bandwidth)
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.random_state=random_state

def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
Expand All @@ -130,17 +129,17 @@ def aggregate(self, posteriors: np.ndarray):
:param posteriors: instances in the sample converted into posterior probabilities
:return: a vector of class prevalence estimates
"""
np.random.RandomState(self.random_state)
epsilon = 1e-10
n_classes = len(self.mix_densities)
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
with qp.util.temp_seed(self.random_state):
epsilon = 1e-10
n_classes = len(self.mix_densities)
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]

def neg_loglikelihood(prev):
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
def neg_loglikelihood(prev):
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)

return F.optim_minimize(neg_loglikelihood, n_classes)
return F.optim_minimize(neg_loglikelihood, n_classes)


class KDEyHD(AggregativeSoftQuantifier, KDEBase):
Expand Down Expand Up @@ -183,20 +182,17 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated.
:param bandwidth: float, the bandwidth of the Kernel
:param n_jobs: number of parallel workers
:param random_state: a seed to be set before fitting any base quantifier (default None)
:param montecarlo_trials: number of Monte Carlo trials (default 10000)
"""

def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD',
bandwidth=0.1, n_jobs=None, random_state=None, montecarlo_trials=10000):
def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD',
bandwidth=0.1, random_state=None, montecarlo_trials=10000):

self._check_bandwidth(bandwidth)
self.classifier = classifier
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.divergence = divergence
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.random_state=random_state
self.montecarlo_trials = montecarlo_trials

Expand Down Expand Up @@ -278,15 +274,12 @@ class KDEyCS(AggregativeSoftQuantifier):
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated.
:param bandwidth: float, the bandwidth of the Kernel
:param n_jobs: number of parallel workers
"""

def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None):
KDEBase._check_bandwidth(bandwidth)
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.bandwidth = KDEBase._check_bandwidth(bandwidth)

def gram_matrix_mix_sum(self, X, Y=None):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
Expand Down Expand Up @@ -355,7 +348,7 @@ def divergence(alpha):
# called \overline{r} in the paper
alpha_ratio = alpha * self.counts_inv

# recal that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
# recall that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
partA = -np.log((alpha_ratio @ tr_te_sums) * Minv)
partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio)
return partA + partB #+ partC
Expand Down
14 changes: 7 additions & 7 deletions quapy/method/_threshold_optim.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""

def __init__(self, classifier: BaseEstimator, val_split=None, n_jobs=None):
self.classifier = classifier
def __init__(self, classifier: BaseEstimator=None, val_split=None, n_jobs=None):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.n_jobs = qp._get_njobs(n_jobs)

Expand Down Expand Up @@ -143,7 +143,7 @@ class T50(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""

def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)

def condition(self, tpr, fpr) -> float:
Expand All @@ -167,7 +167,7 @@ class MAX(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""

def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)

def condition(self, tpr, fpr) -> float:
Expand All @@ -192,7 +192,7 @@ class X(ThresholdOptimization):
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""

def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)

def condition(self, tpr, fpr) -> float:
Expand All @@ -215,7 +215,7 @@ class prevalence estimates for all decision thresholds and returns the median of
`k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)

def condition(self, tpr, fpr) -> float:
Expand Down Expand Up @@ -254,7 +254,7 @@ class prevalence estimates for all decision thresholds and returns the median of
`k`-fold cross validation (this integer stands for the number of folds `k`, defaults 5), or as a
:class:`quapy.data.base.LabelledCollection` (the split itself).
"""
def __init__(self, classifier: BaseEstimator, val_split=5):
def __init__(self, classifier: BaseEstimator=None, val_split=5):
super().__init__(classifier, val_split)

def discard(self, tpr, fpr) -> bool:
Expand Down
Loading

0 comments on commit ad11b86

Please sign in to comment.