diff --git a/.ci/linux-steps.yaml b/.ci/linux-steps.yaml
index 3580b6a9..d22869b8 100644
--- a/.ci/linux-steps.yaml
+++ b/.ci/linux-steps.yaml
@@ -45,7 +45,7 @@ steps:
   displayName: 'Build models'

 # Run CTests.
-- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
+- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
   displayName: 'Run tests via ctest'

 # Publish test results to Azure Pipelines
diff --git a/.ci/macos-steps.yaml b/.ci/macos-steps.yaml
index f4d87f64..d867d975 100644
--- a/.ci/macos-steps.yaml
+++ b/.ci/macos-steps.yaml
@@ -37,7 +37,7 @@ steps:
   displayName: 'Build models'

 # Run CTests.
-- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
+- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
   displayName: 'Run tests via ctest'

 # Publish test results to Azure Pipelines
diff --git a/.ci/windows-steps.yaml b/.ci/windows-steps.yaml
index 7842df4d..57e1be52 100644
--- a/.ci/windows-steps.yaml
+++ b/.ci/windows-steps.yaml
@@ -134,7 +134,7 @@ steps:
 # Run tests via ctest.
 - bash: |
     cd build/tests
-    CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release -R UtilsTest
+    CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release .
   displayName: 'Run tests via ctest'

 # Publish test results to Azure Pipelines
diff --git a/.gitignore b/.gitignore
index 55454996..609ef60d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,5 +4,8 @@ xcode*
 .idea
 cmake-build-*
 *.csv
+*.tar
+*.zip
+*.tar.gz
 .travis/configs.hpp
 Testing/*
diff --git a/data/annotations/2007_000027.xml b/data/annotations/2007_000027.xml
new file mode 100755
index 00000000..576da530
--- /dev/null
+++ b/data/annotations/2007_000027.xml
@@ -0,0 +1,63 @@
+<annotation>
+  <folder>VOC2012</folder>
+  <filename>2007_000027.jpg</filename>
+  <source>
+    <database>The VOC2007 Database</database>
+    <annotation>PASCAL VOC2007</annotation>
+    <image>flickr</image>
+  </source>
+  <size>
+    <width>486</width>
+    <height>500</height>
+    <depth>3</depth>
+  </size>
+  <segmented>0</segmented>
+  <object>
+    <name>person</name>
+    <pose>Unspecified</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>174</xmin>
+      <ymin>101</ymin>
+      <xmax>349</xmax>
+      <ymax>351</ymax>
+    </bndbox>
+    <part>
+      <name>head</name>
+      <bndbox>
+        <xmin>169</xmin>
+        <ymin>104</ymin>
+        <xmax>209</xmax>
+        <ymax>146</ymax>
+      </bndbox>
+    </part>
+    <part>
+      <name>hand</name>
+      <bndbox>
+        <xmin>278</xmin>
+        <ymin>210</ymin>
+        <xmax>297</xmax>
+        <ymax>233</ymax>
+      </bndbox>
+    </part>
+    <part>
+      <name>foot</name>
+      <bndbox>
+        <xmin>273</xmin>
+        <ymin>333</ymin>
+        <xmax>297</xmax>
+        <ymax>354</ymax>
+      </bndbox>
+    </part>
+    <part>
+      <name>foot</name>
+      <bndbox>
+        <xmin>319</xmin>
+        <ymin>307</ymin>
+        <xmax>340</xmax>
+        <ymax>326</ymax>
+      </bndbox>
+    </part>
+  </object>
+</annotation>
diff --git a/data/annotations/2007_000032.xml b/data/annotations/2007_000032.xml
new file mode 100755
index 00000000..779abb63
--- /dev/null
+++ b/data/annotations/2007_000032.xml
@@ -0,0 +1,63 @@
+<annotation>
+  <folder>VOC2012</folder>
+  <filename>2007_000032.jpg</filename>
+  <source>
+    <database>The VOC2007 Database</database>
+    <annotation>PASCAL VOC2007</annotation>
+    <image>flickr</image>
+  </source>
+  <size>
+    <width>500</width>
+    <height>281</height>
+    <depth>3</depth>
+  </size>
+  <segmented>1</segmented>
+  <object>
+    <name>aeroplane</name>
+    <pose>Frontal</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>104</xmin>
+      <ymin>78</ymin>
+      <xmax>375</xmax>
+      <ymax>183</ymax>
+    </bndbox>
+  </object>
+  <object>
+    <name>aeroplane</name>
+    <pose>Left</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>133</xmin>
+      <ymin>88</ymin>
+      <xmax>197</xmax>
+      <ymax>123</ymax>
+    </bndbox>
+  </object>
+  <object>
+    <name>person</name>
+    <pose>Rear</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>195</xmin>
+      <ymin>180</ymin>
+      <xmax>213</xmax>
+      <ymax>229</ymax>
+    </bndbox>
+  </object>
+  <object>
+    <name>person</name>
+    <pose>Rear</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>26</xmin>
+      <ymin>189</ymin>
+      <xmax>44</xmax>
+      <ymax>238</ymax>
+    </bndbox>
+  </object>
+</annotation>
diff --git a/dataloader/dataloader.hpp b/dataloader/dataloader.hpp
index 12093a4d..4b3249b2 100644
--- a/dataloader/dataloader.hpp
+++ b/dataloader/dataloader.hpp
@@ -10,10 +10,13 @@
  * http://www.opensource.org/licenses/BSD-3-Clause for more information.
  */
 #include
+#include
 #include
 #include
-#include
+#include
 #include
+#include
+#include
 #include
 #include

@@ -117,6 +120,28 @@ class DataLoader
                 std::vector(),
             const double augmentationProbability = 0.2);

+  /**
+   * Loads an object detection dataset. It requires annotations in XML format.
+   * Each XML file should correspond to a single image in the images folder.
+   *
+   * The XML file should contain the following:
+   * 1. Each XML file should be wrapped in an annotation tag.
+   * 2. The filename of the image in the images folder will be depicted by the filename tag.
+   * 3. An object tag depicting the characteristics of the bounding box.
+   * 4. Each object tag should contain a name tag, i.e. the class of the object.
+   * 5. Each object tag should contain a bndbox tag containing xmin, ymin, xmax, ymax.
+   *
+   * NOTE: Labels are assigned lexicographically. Set verbose to 1 to print labels
+   *       and their corresponding class.
+   *
+   * @param pathToAnnotations Path to the folder containing XML annotation files.
+   * @param pathToImages Path to the folder containing images corresponding to the annotations.
+   * @param absolutePath Boolean to determine if absolute path is used. Defaults to false.
+   */
+  void LoadObjectDetectionDataset(const std::string& pathToAnnotations,
+                                  const std::string& pathToImages,
+                                  const bool absolutePath = false);
+
   //! Get the training dataset features.
   DatasetX TrainFeatures() const { return trainFeatures; }
@@ -179,11 +204,30 @@
    */
   void DownloadDataset(const std::string& dataset)
   {
+    if (datasetMap[dataset].zipFile && (!Utils::PathExists(
+        datasetMap[dataset].trainPath) ||
+        !Utils::PathExists(datasetMap[dataset].testPath)))
+    {
+      Utils::DownloadFile(datasetMap[dataset].datasetURL,
+          datasetMap[dataset].datasetPath, dataset + "_training_data.",
+          false, false, datasetMap[dataset].serverName,
+          datasetMap[dataset].zipFile);
+
+      if (!Utils::CompareCRC32(datasetMap[dataset].datasetPath,
+          datasetMap[dataset].datasetHash))
+      {
+        mlpack::Log::Fatal << "Corrupted Data for " << dataset <<
+            " downloaded." << std::endl;
+      }
+
+      return;
+    }
+
     if (!Utils::PathExists(datasetMap[dataset].trainPath))
     {
-      Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
+      Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
           datasetMap[dataset].trainPath, dataset + "_training_data.",
-          false);
+          false, false, datasetMap[dataset].serverName);

       if (!Utils::CompareCRC32(datasetMap[dataset].trainPath,
           datasetMap[dataset].trainHash))
@@ -192,11 +236,12 @@
             dataset << " downloaded." << std::endl;
       }
     }
+
     if (!Utils::PathExists(datasetMap[dataset].testPath))
     {
-      Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
+      Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
           datasetMap[dataset].testPath, dataset + "_testing_data.",
-          false);
+          false, false, datasetMap[dataset].serverName);

       if (!Utils::CompareCRC32(datasetMap[dataset].testPath,
           datasetMap[dataset].testHash))
diff --git a/dataloader/dataloader_impl.hpp b/dataloader/dataloader_impl.hpp
index 40a23454..e423fc34 100644
--- a/dataloader/dataloader_impl.hpp
+++ b/dataloader/dataloader_impl.hpp
@@ -106,13 +106,6 @@ template<
   arma::mat trainDataset, validDataset;
   data::Split(dataset, trainDataset, validDataset, ratio, shuffle);

-  if (useScaler)
-  {
-    scaler.Fit(trainDataset);
-    scaler.Transform(trainDataset, trainDataset);
-    scaler.Transform(validDataset, validDataset);
-  }
-
   trainFeatures = trainDataset.rows(WrapIndex(startInputFeatures,
       trainDataset.n_rows), WrapIndex(endInputFeatures,
       trainDataset.n_rows));
@@ -125,10 +118,16 @@
       validDataset.n_rows), WrapIndex(endInputFeatures,
       validDataset.n_rows));
-  validLabels = trainDataset.rows(WrapIndex(startPredictionFeatures,
+  validLabels = validDataset.rows(WrapIndex(startPredictionFeatures,
       validDataset.n_rows), WrapIndex(endPredictionFeatures,
       validDataset.n_rows));

+  if (useScaler)
+  {
+    scaler.Fit(trainFeatures);
+    scaler.Transform(trainFeatures, trainFeatures);
+    scaler.Transform(validFeatures, validFeatures);
+  }
   // TODO : Add support for augmentation here.
   mlpack::Log::Info << "Training Dataset Loaded."
       << std::endl;
 }

@@ -145,4 +144,37 @@

+template<
+    typename DatasetX,
+    typename DatasetY,
+    class ScalerType
+> void DataLoader<
+    DatasetX, DatasetY, ScalerType
+>::LoadObjectDetectionDataset(const std::string& pathToAnnotations,
+                              const std::string& pathToImages,
+                              const bool absolutePath)
+{
+  std::vector<boost::filesystem::path> annotationsDirectory, imagesDirectory;
+
+  // Fill the directory.
+  Utils::ListDir(pathToAnnotations, annotationsDirectory, absolutePath);
+  Utils::ListDir(pathToImages, imagesDirectory, absolutePath);
+
+  // Read the xml file.
+  for (boost::filesystem::path annotationFile : annotationsDirectory)
+  {
+    // Read the xml file.
+    boost::property_tree::ptree annotation;
+    std::cout << annotationFile.string() << std::endl;
+    boost::property_tree::read_xml(annotationFile.string(), annotation);
+
+    // Read properties inside annotation file.
+    BOOST_FOREACH (boost::property_tree::ptree::value_type const& object,
+        annotation.get_child("annotation.object"))
+    {
+      std::cout << object.first << std::endl;
+    }
+  }
+}
+
 #endif
diff --git a/dataloader/datasets.hpp b/dataloader/datasets.hpp
index a0206b8f..57b8b2a3 100644
--- a/dataloader/datasets.hpp
+++ b/dataloader/datasets.hpp
@@ -26,15 +26,47 @@ template<
 > struct DatasetDetails
 {
+  //! Locally stored name of dataset used for identification
+  //! during dataloader call.
   std::string datasetName;
-  std::string trainDownloadUrl;
-  std::string testDownloadUrl;
+
+  //! Locally stored URL for downloading training data.
+  std::string trainDownloadURL;
+
+  //! Locally stored URL for downloading testing data.
+  std::string testDownloadURL;
+
+  //! CRC-32 checksum for training data file.
   std::string trainHash;
+
+  //! CRC-32 checksum for testing data file.
   std::string testHash;
+
+  //! Locally stored boolean to determine if dataset is of CSV or similar
+  //! format.
   bool loadCSV;
+
+  //! Locally stored path to file / directory for training data.
   std::string trainPath;
+
+  //! Locally stored path to file / directory for testing data.
   std::string testPath;
+  //! Locally held boolean to determine whether dataset will be in zip format.
+  bool zipFile;
+
+  //! Locally stored URL for downloading dataset.
+  std::string datasetURL;
+
+  //! Locally stored CRC-32 checksum for the dataset.
+  std::string datasetHash;
+
+  //! Locally stored path for saving the archived / zip dataset.
+  std::string datasetPath;
+
+  //! Locally stored server name for download file.
+  std::string serverName;
+
   // Pre-Process functor.
   std::function PreProcess;
@@ -61,13 +93,18 @@ struct DatasetDetails
   // Default constructor.
   DatasetDetails() :
       datasetName(""),
-      trainDownloadUrl(""),
-      testDownloadUrl(""),
+      trainDownloadURL(""),
+      testDownloadURL(""),
       trainHash(""),
       testHash(""),
       loadCSV(false),
       trainPath(""),
       testPath(""),
+      zipFile(false),
+      datasetURL(""),
+      datasetPath(""),
+      datasetHash(""),
+      serverName("www.mlpack.org"),
       startTrainingInputFeatures(0),
       endTrainingInputFeatures(0),
       startTrainingPredictionFeatures(0),
@@ -77,23 +114,85 @@ struct DatasetDetails
       dropHeader(false)
   {/* Nothing to do here. */}

-  // Constructor for initializing object.
+  /**
+   * Constructor for initializing object for separate
+   * train and test download URLs.
+   *
+   * @param datasetName Name of dataset used for identification during
+   *                    dataloader call.
+   * @param trainDownloadURL URL for downloading training data.
+   * @param testDownloadURL URL for downloading testing data.
+   * @param trainHash CRC-32 checksum for training data.
+   * @param testHash CRC-32 checksum for testing data.
+   * @param loadCSV Determines if the format of dataset is similar to CSV.
+   * @param trainPath Path for training dataset.
+   * @param testPath Path for testing dataset.
+   */
   DatasetDetails(const std::string& datasetName,
-                 const std::string& trainDownloadUrl,
-                 const std::string& testDownloadUrl,
+                 const std::string& trainDownloadURL,
+                 const std::string& testDownloadURL,
                  const std::string& trainHash,
                  const std::string& testHash,
                  const bool loadCSV,
                  const std::string& trainPath,
                  const std::string& testPath) :
                  datasetName(datasetName),
-                 trainDownloadUrl(trainDownloadUrl),
-                 testDownloadUrl(testDownloadUrl),
+                 trainDownloadURL(trainDownloadURL),
+                 testDownloadURL(testDownloadURL),
                  trainHash(trainHash),
                  testHash(testHash),
                  loadCSV(loadCSV),
                  trainPath(trainPath),
                  testPath(testPath),
+                 zipFile(false),
+                 datasetURL(""),
+                 datasetHash(""),
+                 serverName("www.mlpack.org"),
+                 startTrainingInputFeatures(0),
+                 endTrainingInputFeatures(0),
+                 startTrainingPredictionFeatures(0),
+                 endTrainingPredictionFeatures(0),
+                 startTestingInputFeatures(0),
+                 endTestingInputFeatures(0),
+                 dropHeader(false)
+  {
+    // Nothing to do here.
+  }
+
+  /**
+   * Constructor for initializing paths for zip files.
+   *
+   * @param datasetName Name of dataset used for identification during
+   *                    dataloader call.
+   * @param zipFile Boolean to determine if dataset is stored in zip format.
+   * @param datasetURL URL for downloading dataset.
+   * @param datasetPath Path where the dataset will be downloaded.
+   * @param datasetHash CRC-32 checksum for dataset.
+   * @param loadCSV Determines if the format of dataset is similar to CSV.
+   * @param trainPath Path for training dataset.
+   * @param testPath Path for testing dataset.
+   */
+  DatasetDetails(const std::string& datasetName,
+                 const bool zipFile,
+                 const std::string& datasetURL,
+                 const std::string& datasetPath,
+                 const std::string& datasetHash,
+                 const bool loadCSV,
+                 const std::string& trainPath,
+                 const std::string& testPath) :
+                 datasetName(datasetName),
+                 zipFile(zipFile),
+                 datasetURL(datasetURL),
+                 datasetHash(datasetHash),
+                 datasetPath(datasetPath),
+                 loadCSV(loadCSV),
+                 trainPath(trainPath),
+                 testPath(testPath),
+                 trainDownloadURL(""),
+                 testDownloadURL(""),
+                 trainHash(""),
+                 testHash(""),
+                 serverName("www.mlpack.org"),
                  startTrainingInputFeatures(0),
                  endTrainingInputFeatures(0),
                  startTrainingPredictionFeatures(0),
@@ -119,14 +218,15 @@ class Datasets
 {
  public:
+  //! Get details of MNIST Dataset.
   const static DatasetDetails MNIST()
   {
     DatasetDetails mnistDetails(
         "mnist",
-        "/datasets/mnist_train.csv",
-        "/datasets/mnist_test.csv",
-        "772495e3",
-        "8bcdb7e1",
+        true,
+        "/datasets/mnist.tar.gz",
+        "./../data/mnist.tar.gz",
+        "9fa4efe5",
         true,
         "./../data/mnist_train.csv",
         "./../data/mnist_test.csv");
diff --git a/tests/dataloader_tests.cpp b/tests/dataloader_tests.cpp
index e435a61e..86c06630 100644
--- a/tests/dataloader_tests.cpp
+++ b/tests/dataloader_tests.cpp
@@ -50,6 +50,8 @@ BOOST_AUTO_TEST_CASE(CSVDataLoaderTest)
   // Check for training dataset using tuples.
   BOOST_REQUIRE_EQUAL(std::get<0>(irisDataloader.TrainSet()).n_cols, 75);
   BOOST_REQUIRE_EQUAL(std::get<0>(irisDataloader.TrainSet()).n_rows, 4);
+
+  Utils::RemoveFile("./../data/iris.csv");
 }

 /**
@@ -57,12 +59,23 @@
  */
 BOOST_AUTO_TEST_CASE(MNISTDataLoaderTest)
 {
+  /**
   DataLoader<> dataloader("mnist", true, 0.80);

   // Check for correct dimensions.
   BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_cols, 784);
   BOOST_REQUIRE_EQUAL(dataloader.TestFeatures().n_cols, 784);
   BOOST_REQUIRE_EQUAL(dataloader.ValidFeatures().n_cols, 784);
   BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_rows, 33600);
+  */
+}
+
+/**
+ * Simple Test for object detection dataloader.
+ */
+BOOST_AUTO_TEST_CASE(ObjectDetectionDataLoader)
+{
+  DataLoader<> dataloader;
+  dataloader.LoadObjectDetectionDataset("./../data/annotations/", "./../data");
 }

 BOOST_AUTO_TEST_SUITE_END();
diff --git a/tests/utils_tests.cpp b/tests/utils_tests.cpp
index 90b0131c..aa04fec7 100644
--- a/tests/utils_tests.cpp
+++ b/tests/utils_tests.cpp
@@ -74,4 +74,17 @@ BOOST_AUTO_TEST_CASE(RemoveFileTest)
   BOOST_REQUIRE_EQUAL(Utils::PathExists("./../data/file.txt"), 0);
 }

+BOOST_AUTO_TEST_CASE(ExtractFilesTest)
+{
+  Utils::DownloadFile("/datasets/mnist.tar.gz", "./../data/mnist.tar.gz", "",
+      false, true, "www.mlpack.org", true, "./../data/");
+
+  BOOST_REQUIRE(Utils::PathExists("./../data/mnist_all.csv"));
+  BOOST_REQUIRE(Utils::PathExists("./../data/mnist.tar.gz"));
+
+  // Clean up.
+  Utils::RemoveFile("./../data/mnist_all.csv");
+  Utils::RemoveFile("./../data/mnist_all_centroids.csv");
+}
+
 BOOST_AUTO_TEST_SUITE_END();
diff --git a/utils/utils.hpp b/utils/utils.hpp
index 903aa201..6709d7dc 100644
--- a/utils/utils.hpp
+++ b/utils/utils.hpp
@@ -43,6 +43,47 @@ class Utils
     return (stat(filePath.c_str(), &buffer) == 0);
   }

+  /**
+   * Unzips any supported tar file.
+   *
+   * @param pathToArchive Path to where the tar file is stored.
+   * @param pathForExtraction Path where files will be extracted.
+   * @param absolutePath Boolean to determine if path is absolute or relative.
+   */
+  static int ExtractFiles(const std::string pathToArchive,
+                          const std::string pathForExtraction,
+                          const bool absolutePath = false)
+  {
+    std::string command = "tar -xvzf ";
+    if (!absolutePath)
+    {
+      #ifdef _WIN32
+        std::string pathToArchiveTemp(pathToArchive);
+        std::string pathForExtractionTemp(pathForExtraction);
+        std::replace(pathToArchiveTemp.begin(), pathToArchiveTemp.end(), '/',
+            '\\');
+        std::replace(pathForExtractionTemp.begin(), pathForExtractionTemp.end(),
+            '/', '\\');
+
+        command = "tar --force-local -xvzf " +
+            boost::filesystem::current_path().string() + "\\" +
+            pathToArchiveTemp;
+      #else
+        command = command + boost::filesystem::current_path().string() + "/" +
+            pathToArchive + " -C " + boost::filesystem::current_path().string() +
+            "/" + pathForExtraction;
+      #endif
+    }
+    else
+    {
+      command = command + pathToArchive + " -C " + pathForExtraction;
+    }
+
+    // Run the command using system command.
+    std::system(command.c_str());
+    return 0;
+  }
+
   /**
    * Downloads files using boost asio.
    *
@@ -55,6 +96,8 @@
    * @param absolutePath Boolean to determine if path is absolute or relative.
    * @param silent Boolean to display details of file being downloaded.
    * @param serverName Server to connect to, for downloading.
+   * @param zipFile Determines if dataset needs to be extracted or not.
+   * @param pathForExtraction Path where files will be extracted if zipFile is true.
    * @returns 0 to determine success.
    */
   static int DownloadFile(const std::string url,
@@ -63,7 +106,9 @@
                           const bool absolutePath = false,
                           const bool silent = true,
                           const std::string serverName =
-                              "www.mlpack.org")
+                              "www.mlpack.org",
+                          const bool zipFile = false,
+                          const std::string pathForExtraction = "./../data/")
   {
     // IO functionality by boost core.
     boost::asio::io_service ioService;
@@ -151,6 +196,13 @@
     }
     outputFile.close();
+
+    // Extract Files.
+    if (zipFile)
+    {
+      Utils::ExtractFiles(downloadPath, pathForExtraction);
+    }
+
     return 0;
   }
@@ -213,5 +265,36 @@
     return 0;
   }

+
+  /**
+   * Fills a vector with paths to all files in a directory.
+   *
+   * @param path Path to the directory.
+   * @param pathVector A vector of type filesystem::path, which will be filled
+   *                   with paths for all files / folders in the given directory path.
+   * @param absolutePath Boolean to determine if path is absolute or relative.
+   */
+  static void ListDir(const std::string& path,
+                      std::vector<boost::filesystem::path>& pathVector,
+                      const bool absolutePath = false)
+  {
+    if (Utils::PathExists(path, absolutePath))
+    {
+      boost::filesystem::path directoryPath(path);
+
+      // Fill the path vector with respective paths.
+      std::copy(boost::filesystem::directory_iterator(directoryPath),
+                boost::filesystem::directory_iterator(),
+                std::back_inserter(pathVector));
+
+      // Sort the path vector.
+      std::sort(pathVector.begin(), pathVector.end());
+    }
+    else
+    {
+      mlpack::Log::Warn << "The path " << path << " doesn't exist." << std::endl;
+    }
+  }
 };
+
 #endif
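The new tests above already exercise these code paths end to end; the snippet below is only a minimal usage sketch of how the pieces added in this patch fit together. The include paths are assumptions about the repository layout, and the relative ./../data paths, the mnist.tar.gz archive, and the annotations folder mirror the test fixtures above rather than a guaranteed public API.

// Minimal usage sketch (assumptions noted above; not part of the patch itself).
#include <dataloader/dataloader.hpp>  // Assumed include path for DataLoader.
#include <utils/utils.hpp>            // Assumed include path for Utils.

int main()
{
  // Download an archived dataset and extract it next to the data directory,
  // exercising the new zipFile / pathForExtraction arguments of DownloadFile().
  Utils::DownloadFile("/datasets/mnist.tar.gz", "./../data/mnist.tar.gz", "",
      false, true, "www.mlpack.org", true, "./../data/");

  // Parse PASCAL VOC style annotations; each XML file in the annotations
  // folder corresponds to a single image in the images folder.
  DataLoader<> dataloader;
  dataloader.LoadObjectDetectionDataset("./../data/annotations/", "./../data");

  return 0;
}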