-
Notifications
You must be signed in to change notification settings - Fork 2
/
Dataset.h
111 lines (87 loc) · 2.89 KB
/
Dataset.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
//
// Created by squall on 18-6-11.
//
#ifndef MTREE_DATASET_H
#define MTREE_DATASET_H
#include <algorithm>
#include <fstream>
#include <functional>
#include <set>
#include <string>
#include <utility>
#include <vector>
// #include <bits/unique_ptr.h>
#include "utils.h"
#include "Random.h"
using namespace std;
using namespace common;
class Dataset {
public:
Dataset(int feature_size) : feature_size(feature_size) {
for (int i = 0; i < feature_size; i++) {
vector<float> feature;
this->data.push_back(feature);
}
this->candidate_cut_points.resize(feature_size);
}
/*! \belief: get sample from the file */
int load_data_from_file(const string &file_name, const char *delimiter);
/*! \belief: get sample by index. */
int get_sample_by_index(vector<int> &index, vector<vector<float>> &selected_sample,
vector<float> &selected_label, vector<int> &selected_task,
Matrix &selected_gradients) const;
int get_data_by_index(vector<int> &index, Dataset &dataset) const;
int get_sample_by_task(int task_id, vector<vector<float>> &selected_sample,
vector<float> &selected_label, vector<int> &selected_task,
Matrix &selected_gradients) const;
/*! \belief: split data by task_id. */
int get_data_by_tasks(vector<Dataset> &datasets) const;
/*! \belief: shuffle split data into train and test set. */
vector<pair<Dataset, Dataset>> shuffle_split(const int &n_splits,
const float &test_size,
const int &random_state) const;
vector<pair<Dataset, Dataset>> shuffle_split_by_size(const int n_splits,
const int train_size,
const int test_size,
const int random_state) const;
/*! \belief: split data into train and test set. */
pair<Dataset, Dataset> train_test_split(const float &test_size, const int &random_state) const;
int set_gradients(const Matrix &gradients) {
this->gradients = gradients;
return SUCCESS;
}
int set_task_num(const int task_num) {
this->task_num = task_num;
return SUCCESS;
}
const Matrix &get_gradients() const {
return this->gradients;
}
int get_feature_size() const {
return this->feature_size;
}
int get_task_num() const {
return this->task_num;
}
const vector<float> &get_label_data() const {
return this->label;
}
const vector<int> &get_task_data() {
return this->task;
}
const int get_data_size() const {
return this->dataset_size;
}
const Matrix &get_data() const {
return this->data;
}
set<float>& get_unique_points(int feature_index);
private:
Matrix data;
int feature_size;
int dataset_size;
int task_num;
vector<float> label;
vector<int> task;
Matrix gradients;
vector<set<float>> candidate_cut_points;
};
#endif //MTREE_DATASET_H