diff --git a/.ci/linux-steps.yaml b/.ci/linux-steps.yaml index 3580b6a9..d22869b8 100644 --- a/.ci/linux-steps.yaml +++ b/.ci/linux-steps.yaml @@ -45,7 +45,7 @@ steps: displayName: 'Build models' # Run CTests. -- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest +- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test . displayName: 'Run tests via ctest' # Publish test results to Azure Pipelines diff --git a/.ci/macos-steps.yaml b/.ci/macos-steps.yaml index f4d87f64..d867d975 100644 --- a/.ci/macos-steps.yaml +++ b/.ci/macos-steps.yaml @@ -37,7 +37,7 @@ steps: displayName: 'Build models' # Run CTests. -- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest +- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test . displayName: 'Run tests via ctest' # Publish test results to Azure Pipelines diff --git a/.ci/windows-steps.yaml b/.ci/windows-steps.yaml index 7842df4d..57e1be52 100644 --- a/.ci/windows-steps.yaml +++ b/.ci/windows-steps.yaml @@ -134,7 +134,7 @@ steps: # Run tests via ctest. - bash: | cd build/tests - CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release -R UtilsTest + CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release . displayName: 'Run tests via ctest' # Publish test results to Azure Pipelines diff --git a/.gitignore b/.gitignore index 55454996..609ef60d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,5 +4,8 @@ xcode* .idea cmake-build-* *.csv +*.tar +*.zip +*.tar.gz .travis/configs.hpp Testing/* diff --git a/dataloader/dataloader.hpp b/dataloader/dataloader.hpp index 12093a4d..9b0e9c9d 100644 --- a/dataloader/dataloader.hpp +++ b/dataloader/dataloader.hpp @@ -63,14 +63,14 @@ class DataLoader * * @param datasetPath Path or name of dataset. * @param shuffle whether or not to shuffle the data. - * @param ratio Ratio for train-test split. + * @param validRatio Ratio of dataset to be used for validation set. 
* @param useScaler Use feature scaler for pre-processing the dataset. * @param augmentation Adds augmentation to training data only. * @param augmentationProbability Probability of applying augmentation on dataset. */ DataLoader(const std::string& dataset, const bool shuffle, - const double ratio = 0.75, + const double validRatio = 0.25, const bool useScaler = true, const std::vector augmentation = std::vector(), @@ -85,7 +85,7 @@ class DataLoader * Note: This option augmentation to NULL, set ratio to 1 and * scaler will be used to only transform the test data. * @param shuffle Boolean to determine whether or not to shuffle the data. - * @param ratio Ratio for train-test split. + * @param validRatio Ratio of dataset to be used for validation set. * @param useScaler Fits the scaler on training data and transforms dataset. * @param dropHeader Drops the first row from CSV. * @param startInputFeatures First Index which will be fed into the model as input. @@ -106,7 +106,7 @@ class DataLoader void LoadCSV(const std::string& datasetPath, const bool loadTrainData = true, const bool shuffle = true, - const double ratio = 0.75, + const double validRatio = 0.25, const bool useScaler = false, const bool dropHeader = false, const int startInputFeatures = -1, @@ -179,11 +179,30 @@ class DataLoader */ void DownloadDataset(const std::string& dataset) { + if (datasetMap[dataset].zipFile && (!Utils::PathExists( + datasetMap[dataset].trainPath) || + !Utils::PathExists(datasetMap[dataset].testPath))) + { + Utils::DownloadFile(datasetMap[dataset].datasetURL, + datasetMap[dataset].datasetPath, dataset + "_training_data.", + false, false, datasetMap[dataset].serverName, + datasetMap[dataset].zipFile); + + if (!Utils::CompareCRC32(datasetMap[dataset].datasetPath, + datasetMap[dataset].datasetHash)) + { + mlpack::Log::Fatal << "Corrupted Data for " << dataset << + " downloaded." 
<< std::endl; + } + + return; + } + if (!Utils::PathExists(datasetMap[dataset].trainPath)) { - Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl, + Utils::DownloadFile(datasetMap[dataset].trainDownloadURL, datasetMap[dataset].trainPath, dataset + "_training_data.", - false); + false, false, datasetMap[dataset].serverName); if (!Utils::CompareCRC32(datasetMap[dataset].trainPath, datasetMap[dataset].trainHash)) @@ -192,11 +211,12 @@ class DataLoader dataset << " downloaded." << std::endl; } } + if (!Utils::PathExists(datasetMap[dataset].testPath)) { - Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl, + Utils::DownloadFile(datasetMap[dataset].trainDownloadURL, datasetMap[dataset].testPath, dataset + "_testing_data.", - false); + false, false, datasetMap[dataset].serverName); if (!Utils::CompareCRC32(datasetMap[dataset].testPath, datasetMap[dataset].testHash)) diff --git a/dataloader/dataloader_impl.hpp b/dataloader/dataloader_impl.hpp index 40a23454..d58f4939 100644 --- a/dataloader/dataloader_impl.hpp +++ b/dataloader/dataloader_impl.hpp @@ -36,7 +36,7 @@ template< DatasetX, DatasetY, ScalerType >::DataLoader(const std::string& dataset, const bool shuffle, - const double ratio, + const double validRatio, const bool useScaler, const std::vector augmentation, const double augmentationProbability) @@ -49,14 +49,14 @@ template< if (datasetMap[dataset].loadCSV) { - LoadCSV(datasetMap[dataset].trainPath, true, shuffle, ratio, useScaler, - datasetMap[dataset].dropHeader, + LoadCSV(datasetMap[dataset].trainPath, true, shuffle, validRatio, + useScaler, datasetMap[dataset].dropHeader, datasetMap[dataset].startTrainingInputFeatures, datasetMap[dataset].endTrainingInputFeatures, datasetMap[dataset].endTrainingPredictionFeatures, datasetMap[dataset].endTrainingPredictionFeatures); - LoadCSV(datasetMap[dataset].testPath, false, false, useScaler, + LoadCSV(datasetMap[dataset].testPath, false, false, validRatio, useScaler, datasetMap[dataset].dropHeader, 
datasetMap[dataset].startTestingInputFeatures, datasetMap[dataset].endTestingInputFeatures); @@ -85,7 +85,7 @@ template< >::LoadCSV(const std::string& datasetPath, const bool loadTrainData, const bool shuffle, - const double ratio, + const double validRatio, const bool useScaler, const bool dropHeader, const int startInputFeatures, @@ -104,14 +104,7 @@ template< if (loadTrainData) { arma::mat trainDataset, validDataset; - data::Split(dataset, trainDataset, validDataset, ratio, shuffle); - - if (useScaler) - { - scaler.Fit(trainDataset); - scaler.Transform(trainDataset, trainDataset); - scaler.Transform(validDataset, validDataset); - } + data::Split(dataset, trainDataset, validDataset, validRatio, shuffle); trainFeatures = trainDataset.rows(WrapIndex(startInputFeatures, trainDataset.n_rows), WrapIndex(endInputFeatures, @@ -125,10 +118,16 @@ template< validDataset.n_rows), WrapIndex(endInputFeatures, validDataset.n_rows)); - validLabels = trainDataset.rows(WrapIndex(startPredictionFeatures, + validLabels = validDataset.rows(WrapIndex(startPredictionFeatures, validDataset.n_rows), WrapIndex(endPredictionFeatures, validDataset.n_rows)); + if (useScaler) + { + scaler.Fit(trainFeatures); + scaler.Transform(trainFeatures, trainFeatures); + scaler.Transform(validFeatures, validFeatures); + } // TODO : Add support for augmentation here. mlpack::Log::Info << "Training Dataset Loaded." << std::endl; } @@ -139,8 +138,9 @@ template< scaler.Transform(dataset, dataset); } - testFeatures = dataset.submat(WrapIndex(startInputFeatures, dataset.n_rows), - 0, WrapIndex(endInputFeatures, dataset.n_rows), dataset.n_cols - 1); + testFeatures = dataset.rows(WrapIndex(startInputFeatures, dataset.n_rows), + WrapIndex(endInputFeatures, dataset.n_rows)); + mlpack::Log::Info << "Testing Dataset Loaded." 
<< std::endl; } } diff --git a/dataloader/datasets.hpp b/dataloader/datasets.hpp index a0206b8f..9b83403e 100644 --- a/dataloader/datasets.hpp +++ b/dataloader/datasets.hpp @@ -26,15 +26,47 @@ template< > struct DatasetDetails { + //! Locally stored name of dataset used for identification + //! during dataloader call. std::string datasetName; - std::string trainDownloadUrl; - std::string testDownloadUrl; + + //! Locally stored URL for downloading training data. + std::string trainDownloadURL; + + //! Locally stored URL for downloading testing data. + std::string testDownloadURL; + + //! CRC-32 checksum for training data file. std::string trainHash; + + //! CRC-32 checksum for testing data file. std::string testHash; + + //! Locally stored boolean to determine if dataset is of CSV or similar + //! format. bool loadCSV; + + //! Locally stored path to file / directory for training data. std::string trainPath; + + //! Locally stored path to file / directory for testing data. std::string testPath; + //! Locally held boolean to determine whether dataset will be in zip format. + bool zipFile; + + //! Locally stored URL for downloading dataset. + std::string datasetURL; + + //! Locally stored CRC-32 checksum for the dataset. + std::string datasetHash; + + //! Locally stored path for saving the archived / zip dataset. + std::string datasetPath; + + //! Locally stored server name for download file. + std::string serverName; + // Pre-Process functor. std::function PreProcess; @@ -61,13 +93,18 @@ struct DatasetDetails // Default constructor. 
DatasetDetails() : datasetName(""), - trainDownloadUrl(""), - testDownloadUrl(""), + trainDownloadURL(""), + testDownloadURL(""), trainHash(""), testHash(""), loadCSV(false), trainPath(""), testPath(""), + zipFile(false), + datasetURL(""), + datasetPath(""), + datasetHash(""), + serverName("www.mlpack.org"), startTrainingInputFeatures(0), endTrainingInputFeatures(0), startTrainingPredictionFeatures(0), @@ -77,23 +114,85 @@ struct DatasetDetails dropHeader(false) {/* Nothing to do here. */} - // Constructor for initializing object. + /** + * Constructor for initializing object for separate + * train and test download URL. + * + * @param datasetName Name of dataset used for identification during + * dataloader call. + * @param trainDownloadURL URL for downloading training data. + * @param testDownloadURL URL for downloading testing data. + * @param trainHash CRC-32 checksum for training data. + * @param testHash CRC-32 checksum for testing data. + * @param loadCSV Determines if the format of dataset is similar to CSV. + * @param trainPath Path for training dataset. + * @param testPath Path for testing dataset. 
+ */ DatasetDetails(const std::string& datasetName, - const std::string& trainDownloadUrl, - const std::string& testDownloadUrl, + const std::string& trainDownloadURL, + const std::string& testDownloadURL, const std::string& trainHash, const std::string& testHash, const bool loadCSV, const std::string& trainPath, const std::string& testPath) : datasetName(datasetName), - trainDownloadUrl(trainDownloadUrl), - testDownloadUrl(testDownloadUrl), + trainDownloadURL(trainDownloadURL), + testDownloadURL(testDownloadURL), trainHash(trainHash), testHash(testHash), loadCSV(loadCSV), trainPath(trainPath), testPath(testPath), + zipFile(false), + datasetURL(""), + datasetHash(""), + serverName("www.mlpack.org"), + startTrainingInputFeatures(0), + endTrainingInputFeatures(0), + startTrainingPredictionFeatures(0), + endTrainingPredictionFeatures(0), + startTestingInputFeatures(0), + endTestingInputFeatures(0), + dropHeader(false) + { + // Nothing to do here. + } + + /** + * Constructor for initializing paths for zip files. + * + * @param datasetName Name of dataset used for identification during + * dataloader call. + * @param zipFile Boolean to determine if dataset is stored in zip format. + * @param datasetURL URL for downloading dataset. + * @param datasetPath Path where the dataset will be downloaded. + * @param datasetHash CRC-32 checksum for dataset. + * @param loadCSV Determines if the format of dataset is similar to CSV. + * @param trainPath Path for training dataset. + * @param testPath Path for testing dataset. 
+ */ + DatasetDetails(const std::string& datasetName, + const bool zipFile, + const std::string& datasetURL, + const std::string& datasetPath, + const std::string& datasetHash, + const bool loadCSV, + const std::string& trainPath, + const std::string& testPath) : + datasetName(datasetName), + zipFile(zipFile), + datasetURL(datasetURL), + datasetHash(datasetHash), + datasetPath(datasetPath), + loadCSV(loadCSV), + trainPath(trainPath), + testPath(testPath), + trainDownloadURL(""), + testDownloadURL(""), + trainHash(""), + testHash(""), + serverName("www.mlpack.org"), startTrainingInputFeatures(0), endTrainingInputFeatures(0), startTrainingPredictionFeatures(0), @@ -119,17 +218,18 @@ template< class Datasets { public: + //! Get details of MNIST Dataset. const static DatasetDetails MNIST() { DatasetDetails mnistDetails( "mnist", - "/datasets/mnist_train.csv", - "/datasets/mnist_test.csv", - "772495e3", - "8bcdb7e1", true, - "./../data/mnist_train.csv", - "./../data/mnist_test.csv"); + "/datasets/mnist.tar.gz", + "./../data/mnist.tar.gz", + "33470ca3", + true, + "./../data/mnist-dataset/mnist_train.csv", + "./../data/mnist-dataset/mnist_test.csv"); // Set the Pre-Processor Function. mnistDetails.PreProcess = PreProcessor::MNIST; diff --git a/tests/dataloader_tests.cpp b/tests/dataloader_tests.cpp index e435a61e..912cb1ab 100644 --- a/tests/dataloader_tests.cpp +++ b/tests/dataloader_tests.cpp @@ -50,6 +50,8 @@ BOOST_AUTO_TEST_CASE(CSVDataLoaderTest) // Check for training dataset using tuples. BOOST_REQUIRE_EQUAL(std::get<0>(irisDataloader.TrainSet()).n_cols, 75); BOOST_REQUIRE_EQUAL(std::get<0>(irisDataloader.TrainSet()).n_rows, 4); + + Utils::RemoveFile("./../data/iris.csv"); } /** @@ -58,11 +60,30 @@ BOOST_AUTO_TEST_CASE(CSVDataLoaderTest) BOOST_AUTO_TEST_CASE(MNISTDataLoaderTest) { DataLoader<> dataloader("mnist", true, 0.80); + + // Check for correct dimensions. 
+ BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_rows, 784); + BOOST_REQUIRE_EQUAL(dataloader.TestFeatures().n_rows, 784); + BOOST_REQUIRE_EQUAL(dataloader.ValidFeatures().n_rows, 784); + // Check for correct dimensions. - BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_cols, 784); - BOOST_REQUIRE_EQUAL(dataloader.TestFeatures().n_cols, 784); - BOOST_REQUIRE_EQUAL(dataloader.ValidFeatures().n_cols, 784); - BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_rows, 33600); + BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_cols, 8400); + BOOST_REQUIRE_EQUAL(dataloader.ValidFeatures().n_cols, 33600); + BOOST_REQUIRE_EQUAL(dataloader.TestFeatures().n_cols, 28000); + + // Check if we can access both features and labels using + // TrainSet tuple and ValidSet tuple. + BOOST_REQUIRE_EQUAL(std::get<0>(dataloader.TrainSet()).n_cols, 8400); + BOOST_REQUIRE_EQUAL(std::get<1>(dataloader.TrainSet()).n_rows, 1); + BOOST_REQUIRE_EQUAL(std::get<0>(dataloader.ValidSet()).n_cols, 33600); + BOOST_REQUIRE_EQUAL(std::get<1>(dataloader.ValidSet()).n_rows, 1); + + // Clean up. 
+ Utils::RemoveFile("./../data/mnist-dataset/mnist_all.csv"); + Utils::RemoveFile("./../data/mnist-dataset/mnist_all_centroids.csv"); + Utils::RemoveFile("./../data/mnist-dataset/mnist_train.csv"); + Utils::RemoveFile("./../data/mnist-dataset/mnist_test.csv"); + Utils::RemoveFile("./../data/mnist.tar.gz"); } BOOST_AUTO_TEST_SUITE_END(); diff --git a/tests/utils_tests.cpp b/tests/utils_tests.cpp index 90b0131c..4a314496 100644 --- a/tests/utils_tests.cpp +++ b/tests/utils_tests.cpp @@ -74,4 +74,21 @@ BOOST_AUTO_TEST_CASE(RemoveFileTest) BOOST_REQUIRE_EQUAL(Utils::PathExists("./../data/file.txt"), 0); } +BOOST_AUTO_TEST_CASE(ExtractFilesTest) +{ + std::vector vec; + + Utils::DownloadFile("/datasets/USCensus1990.tar.gz", + "./../data/USCensus1990.tar.gz", "", false, true, + "www.mlpack.org", true, "./../data/"); + + BOOST_REQUIRE(Utils::PathExists("./../data/USCensus1990.csv")); + BOOST_REQUIRE(Utils::PathExists("./../data/USCensus1990_centroids.csv")); + + // Clean up. + Utils::RemoveFile("./../data/USCensus1990.csv"); + Utils::RemoveFile("./../data/USCensus1990_centroids.csv"); + Utils::RemoveFile("./../data/USCensus1990.tar.gz"); +} + BOOST_AUTO_TEST_SUITE_END(); diff --git a/utils/utils.hpp b/utils/utils.hpp index 903aa201..7ddb6d61 100644 --- a/utils/utils.hpp +++ b/utils/utils.hpp @@ -43,6 +43,46 @@ class Utils return (stat(filePath.c_str(), &buffer) == 0); } + /** + * Unzips any supported tar file. + * + * @param pathToArchive Path to where the tar file is stored. + * @param pathForExtraction Path where files will be extracted. + * @param absolutePath Boolean to determine if path is absolute or relative. 
+ */ + static int ExtractFiles(const std::string pathToArchive, + const std::string pathForExtraction, + const bool absolutePath = false) + { + std::string command = "tar -xvzf "; + if (!absolutePath) + { + #ifdef _WIN32 + std::string pathToArchiveTemp(pathToArchive); + std::string pathForExtractionTemp(pathForExtraction); + std::replace(pathToArchiveTemp.begin(), pathToArchiveTemp.end(), '/', + '\\'); + std::replace(pathForExtractionTemp.begin(), pathForExtractionTemp.end(), + '/', '\\'); + + command = "tar --force-local -xvzf " + pathToArchiveTemp + " -C " + + pathForExtractionTemp; + #else + command = command + boost::filesystem::current_path().string() + "/" + + pathToArchive + " -C " + boost::filesystem::current_path().string() + + "/" + pathForExtraction; + #endif + } + else + { + command = command + pathToArchive + " -C " + pathForExtraction; + } + + // Run the command using system command. + std::system(command.c_str()); + return 0; + } + /** * Downloads files using boost asio. * @@ -55,6 +95,8 @@ class Utils * @param absolutePath Boolean to determine if path is absolute or relative. * @param silent Boolean to display details of file being downloaded. * @param serverName Server to connect to, for downloading. + * @param zipFile Determines if dataset needs to be extracted or not. + * @param pathForExtraction Path where files will be extracted if zipFile is true. * @returns 0 to determine success. */ static int DownloadFile(const std::string url, @@ -63,7 +105,9 @@ class Utils const bool absolutePath = false, const bool silent = true, const std::string serverName = - "www.mlpack.org") + "www.mlpack.org", + const bool zipFile = false, + const std::string pathForExtraction = "./../data/") { // IO functionality by boost core. boost::asio::io_service ioService; @@ -151,6 +195,13 @@ class Utils } outputFile.close(); + + // Extract Files. 
+ if (zipFile) + { + Utils::ExtractFiles(downloadPath, pathForExtraction); + } + return 0; } @@ -213,5 +264,35 @@ class Utils return 0; } + + /** + * Fills a vector with paths to all files in directory. + * + * @param path Path to Directory. + * @param pathVector A vector of type filesystem::path, which will be filled + * with paths for all files / folders in given directory path. + * @param absolutePath Boolean to determine if path is absolute or relative. + */ + static void ListDir(const std::string& path, + std::vector& pathVector, + const bool absolutePath = false) + { + if (Utils::PathExists(path, absolutePath)) + { + boost::filesystem::path directoryPath(path); + + // Fill the path vector with respective paths. + std::copy(boost::filesystem::directory_iterator(directoryPath), + boost::filesystem::directory_iterator(), + std::back_inserter(pathVector)); + + // Sort the path vector. + std::sort(pathVector.begin(), pathVector.end()); + } + else + { + mlpack::Log::Warn << "The " << path << " doesn't exist." << std::endl; + } + } }; #endif