Skip to content

Commit

Permalink
Merge branch 'lorenzovolpi-cv_len_fix' into devel
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexMoreo committed Nov 8, 2023
2 parents 15777b0 + 3d4ffce commit cc5ab8a
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 26 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,7 @@ are provided:
* [SVMperf](https://github.com/HLT-ISTI/QuaPy/wiki/ExplicitLossMinimization)
* [Model Selection](https://github.com/HLT-ISTI/QuaPy/wiki/Model-Selection)
* [Plotting](https://github.com/HLT-ISTI/QuaPy/wiki/Plotting)

## Acknowledgments:

<img src="SoBigData.png" alt="SoBigData++" width="250"/>
Binary file added SoBigData.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
135 changes: 110 additions & 25 deletions quapy/data/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@ def warn(*args, **kwargs):
import zipfile
from os.path import join
import pandas as pd
import scipy
import quapy
from ucimlrepo import fetch_ucirepo
from quapy.data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns
from quapy.data.reader import *
Expand Down Expand Up @@ -45,6 +44,12 @@ def warn(*args, **kwargs):
'wine-q-red', 'wine-q-white',
'yeast']

UCI_MULTICLASS_DATASETS = ['dry-bean',
'wine-quality',
'academic-success',
'digits',
'letter']

LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']

_TXA_SAMPLE_SIZE = 250
Expand Down Expand Up @@ -549,6 +554,109 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
return data


def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
"""
Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.
The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
- It has more than 1000 instances
- It is suited for classification
- It has more than two classes
- It is available for Python import (requires ucimlrepo package)
>>> import quapy as qp
>>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")
>>> train, test = dataset.train_test
>>> ...
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
:param verbose: set to True (default is False) to get information (stats) about the dataset
:return: a :class:`quapy.data.base.Dataset` instance
"""
data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
return Dataset(*data.split_stratified(1 - test_split, random_state=0))


def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
"""
Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
- It has more than 1000 instances
- It is suited for classification
- It has more than two classes
- It is available for Python import (requires ucimlrepo package)
>>> import quapy as qp
>>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
>>> X, y = collection.Xy
>>> ...
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
~/quay_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
:param verbose: set to True (default is False) to get information (stats) about the dataset
:return: a :class:`quapy.data.base.LabelledCollection` instance
"""
assert dataset_name in UCI_MULTICLASS_DATASETS, \
f'Name {dataset_name} does not match any known dataset from the ' \
f'UCI Machine Learning datasets repository (multiclass). ' \
f'Valid ones are {UCI_MULTICLASS_DATASETS}'

if data_home is None:
data_home = get_quapy_home()

identifiers = {
"dry-bean": 602,
"wine-quality": 186,
"academic-success": 697,
"digits": 80,
"letter": 59
}

full_names = {
"dry-bean": "Dry Bean Dataset",
"wine-quality": "Wine Quality",
"academic-success": "Predict students' dropout and academic success",
"digits": "Optical Recognition of Handwritten Digits",
"letter": "Letter Recognition"
}

identifier = identifiers[dataset_name]
fullname = full_names[dataset_name]

if verbose:
print(f'Loading UCI Muticlass {dataset_name} ({fullname})')

file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')

def download(id):
data = fetch_ucirepo(id=id)
X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
classes = np.sort(np.unique(y))
y = np.searchsorted(classes, y)
return LabelledCollection(X, y)

data = pickled_resource(file, download, identifier)

if verbose:
data.stats()

return data


def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)

Expand Down Expand Up @@ -624,26 +732,3 @@ def download_unzip_and_remove(unzipped_path, url):

return train, val_gen, test_gen

def fetch_IFCB(data_home=None):

if data_home is None:
data_home = get_quapy_home()

URL_TRAINDEV=f'https://zenodo.org/records/10036244/files/IFCB.train.zip'
URL_TEST=f'https://zenodo.org/records/10036244/files/IFCB.test.zip'
URL_TEST_PREV=f'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'

ifcb_dir = join(data_home, 'ifcb')
os.makedirs(ifcb_dir, exist_ok=True)

def download_unzip_and_remove(unzipped_path, url):
tmp_path = join(ifcb_dir, 'tmp.zip')
download_file_if_not_exists(url, tmp_path)
with zipfile.ZipFile(tmp_path) as file:
file.extractall(unzipped_path)
os.remove(tmp_path)

if not os.path.exists(join(ifcb_dir, task)):
download_unzip_and_remove(ifcb_dir, URL_TRAINDEV)
download_unzip_and_remove(ifcb_dir, URL_TEST)
download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)
2 changes: 1 addition & 1 deletion quapy/model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfol
for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
quantifier.fit(train)
fold_prev = quantifier.quantify(test.X)
rel_size = len(test.X)/len(data)
rel_size = 1. * len(test) / len(data)
total_prev += fold_prev*rel_size

return total_prev
Expand Down

0 comments on commit cc5ab8a

Please sign in to comment.