Skip to content

Commit

Permalink
adding standardization for the uci datasets, binary and multi, which …
Browse files Browse the repository at this point in the history
…is by default set to True
  • Loading branch information
AlexMoreo committed Jul 23, 2024
1 parent 9642808 commit 3f20aa0
Showing 1 changed file with 33 additions and 5 deletions.
38 changes: 33 additions & 5 deletions quapy/data/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def warn(*args, **kwargs):
from quapy.data.preprocessing import text2tfidf, reduce_columns
from quapy.data.reader import *
from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
from sklearn.preprocessing import StandardScaler


REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
Expand All @@ -21,11 +22,13 @@ def warn(*args, **kwargs):
'semeval13', 'semeval14', 'semeval15', 'semeval16',
'sst', 'wa', 'wb',
]

TWITTER_SENTIMENT_DATASETS_TRAIN = [
'gasp', 'hcr', 'omd', 'sanders',
'semeval', 'semeval16',
'sst', 'wa', 'wb',
]

UCI_BINARY_DATASETS = [
#'acute.a', 'acute.b',
'balance.1',
Expand Down Expand Up @@ -230,7 +233,7 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
return data


def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, standardize=True, verbose=False) -> Dataset:
"""
Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
Expand All @@ -248,14 +251,20 @@ def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, verbose
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quapy_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest forms the training set
:param standardize: indicates whether the covariates should be standardized or not (default is True). If requested,
standardization applies after the LabelledCollection is split, that is, the mean and std are computed only on the
training portion of the data.
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
:return: a :class:`quapy.data.base.Dataset` instance
"""
data = fetch_UCIBinaryLabelledCollection(dataset_name, data_home, verbose)
return Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name)
dataset = Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name)
if standardize:
dataset = qp.data.preprocessing.standardize(dataset)
return dataset


def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, standardize=True, verbose=False) -> LabelledCollection:
"""
Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
Expand All @@ -279,6 +288,7 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=Fals
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quapy_data/ directory)
:param standardize: indicates whether the covariates should be standardized or not (default is True).
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
:return: a :class:`quapy.data.base.LabelledCollection` instance
"""
Expand Down Expand Up @@ -568,6 +578,10 @@ def binarize_data(name, data: dict) -> LabelledCollection:
data = pickled_resource(file, download, identifier, dataset_group)
data = binarize_data(dataset_name, data)

if standardize:
stds = StandardScaler()
data.instances = stds.fit_transform(data.instances)

if verbose:
data.stats()

Expand All @@ -580,6 +594,7 @@ def fetch_UCIMulticlassDataset(
min_test_split=0.3,
max_train_instances=25000,
min_class_support=100,
standardize=True,
verbose=False) -> Dataset:
"""
Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.
Expand Down Expand Up @@ -610,6 +625,9 @@ def fetch_UCIMulticlassDataset(
set to -1 or None to avoid this check
:param min_class_support: minimum number of instances per class. Classes with fewer instances
are discarded (default is 100)
:param standardize: indicates whether the covariates should be standardized or not (default is True). If requested,
standardization applies after the LabelledCollection is split, that is, the mean and std are computed only on the
training portion of the data.
:param verbose: set to True (default is False) to get information (stats) about the dataset
:return: a :class:`quapy.data.base.Dataset` instance
"""
Expand All @@ -622,10 +640,15 @@ def fetch_UCIMulticlassDataset(
if n_train > max_train_instances:
train_prop = (max_train_instances / n)

return Dataset(*data.split_stratified(train_prop, random_state=0))
data = Dataset(*data.split_stratified(train_prop, random_state=0))

if standardize:
data = qp.data.preprocessing.standardize(data)

return data


def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_class_support=100, verbose=False) -> LabelledCollection:
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_class_support=100, standardize=True, verbose=False) -> LabelledCollection:
"""
Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
Expand All @@ -649,6 +672,7 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
~/quapy_data/ directory)
:param min_class_support: minimum number of instances per class. Classes with fewer instances
are discarded (default is 100)
:param standardize: indicates whether the covariates should be standardized or not (default is True).
:param verbose: set to True (default is False) to get information (stats) about the dataset
:return: a :class:`quapy.data.base.LabelledCollection` instance
"""
Expand Down Expand Up @@ -755,6 +779,10 @@ def filter_classes(data: LabelledCollection, min_ipc):
f'is no longer multiclass. Try a reducing this value.'
)

if standardize:
stds = StandardScaler()
data.instances = stds.fit_transform(data.instances)

if verbose:
data.stats()

Expand Down

0 comments on commit 3f20aa0

Please sign in to comment.