From 3353e2ed1912ba8eb5dbec17cd8d458aa9e39dd6 Mon Sep 17 00:00:00 2001
From: kartikdutt18
Date: Wed, 20 May 2020 22:41:40 +0530
Subject: [PATCH] Add basic definition of models, needs to be trained and tested

Trained a lenet1 model
Add All Weights
Add unzip utility function, fix linux build
Completed everything
Fix build, Style fixes next
Use force local for windows while unzipping tar files
Use force local for windows while unzipping tar files
Added Utility Function
Added Utility Function
Style Fix
Fix Typo causing build error in windows
Fix Typo causing build error in windows
Fix const issue for windows
Extract in data folder
Reposition force local
This should work
Print Path in windows for debugging
Print Path in windows for debugging
Print Path in windows for debugging
Strip components of tar
Strip components of tar
Strip components
Initial Definition
Add List Dir utility function
Add List Dir utility function
Squash this, annotation read
Yay, we can read xml now
---
 .ci/linux-steps.yaml             |   2 +-
 .ci/macos-steps.yaml             |   2 +-
 .ci/windows-steps.yaml           |   2 +-
 .gitignore                       |   3 +
 data/annotations/2007_000027.xml |  63 ++++++++++++++++
 data/annotations/2007_000032.xml |  63 ++++++++++++++++
 dataloader/dataloader.hpp        |  55 ++++++++++++--
 dataloader/dataloader_impl.hpp   |  48 ++++++++++--
 dataloader/datasets.hpp          | 126 +++++++++++++++++++++++++++----
 tests/dataloader_tests.cpp       |  13 ++++
 tests/utils_tests.cpp            |  13 ++++
 utils/utils.hpp                  |  85 ++++++++++++++++++++-
 12 files changed, 445 insertions(+), 30 deletions(-)
 create mode 100755 data/annotations/2007_000027.xml
 create mode 100755 data/annotations/2007_000032.xml

diff --git a/.ci/linux-steps.yaml b/.ci/linux-steps.yaml
index 3580b6a9..d22869b8 100644
--- a/.ci/linux-steps.yaml
+++ b/.ci/linux-steps.yaml
@@ -45,7 +45,7 @@ steps:
     displayName: 'Build models'
 
 # Run CTests.
-- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
+- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
   displayName: 'Run tests via ctest'
 
 # Publish test results to Azure Pipelines
diff --git a/.ci/macos-steps.yaml b/.ci/macos-steps.yaml
index f4d87f64..d867d975 100644
--- a/.ci/macos-steps.yaml
+++ b/.ci/macos-steps.yaml
@@ -37,7 +37,7 @@ steps:
     displayName: 'Build models'
 
 # Run CTests.
-- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
+- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
   displayName: 'Run tests via ctest'
 
 # Publish test results to Azure Pipelines
diff --git a/.ci/windows-steps.yaml b/.ci/windows-steps.yaml
index 7842df4d..57e1be52 100644
--- a/.ci/windows-steps.yaml
+++ b/.ci/windows-steps.yaml
@@ -134,7 +134,7 @@ steps:
 # Run tests via ctest.
 - bash: |
     cd build/tests
-    CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release -R UtilsTest
+    CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release .
   displayName: 'Run tests via ctest'
 
 # Publish test results to Azure Pipelines
diff --git a/.gitignore b/.gitignore
index 55454996..609ef60d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,5 +4,8 @@ xcode*
 .idea
 cmake-build-*
 *.csv
+*.tar
+*.zip
+*.tar.gz
 .travis/configs.hpp
 Testing/*
diff --git a/data/annotations/2007_000027.xml b/data/annotations/2007_000027.xml
new file mode 100755
index 00000000..576da530
--- /dev/null
+++ b/data/annotations/2007_000027.xml
@@ -0,0 +1,63 @@
+<annotation>
+  <folder>VOC2012</folder>
+  <filename>2007_000027.jpg</filename>
+  <source>
+    <database>The VOC2007 Database</database>
+    <annotation>PASCAL VOC2007</annotation>
+    <image>flickr</image>
+  </source>
+  <size>
+    <width>486</width>
+    <height>500</height>
+    <depth>3</depth>
+  </size>
+  <segmented>0</segmented>
+  <object>
+    <name>person</name>
+    <pose>Unspecified</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>174</xmin>
+      <ymin>101</ymin>
+      <xmax>349</xmax>
+      <ymax>351</ymax>
+    </bndbox>
+    <part>
+      <name>head</name>
+      <bndbox>
+        <xmin>169</xmin>
+        <ymin>104</ymin>
+        <xmax>209</xmax>
+        <ymax>146</ymax>
+      </bndbox>
+    </part>
+    <part>
+      <name>hand</name>
+      <bndbox>
+        <xmin>278</xmin>
+        <ymin>210</ymin>
+        <xmax>297</xmax>
+        <ymax>233</ymax>
+      </bndbox>
+    </part>
+    <part>
+      <name>foot</name>
+      <bndbox>
+        <xmin>273</xmin>
+        <ymin>333</ymin>
+        <xmax>297</xmax>
+        <ymax>354</ymax>
+      </bndbox>
+    </part>
+    <part>
+      <name>foot</name>
+      <bndbox>
+        <xmin>319</xmin>
+        <ymin>307</ymin>
+        <xmax>340</xmax>
+        <ymax>326</ymax>
+      </bndbox>
+    </part>
+  </object>
+</annotation>
diff --git a/data/annotations/2007_000032.xml b/data/annotations/2007_000032.xml
new file mode 100755
index 00000000..779abb63
--- /dev/null
+++ b/data/annotations/2007_000032.xml
@@ -0,0 +1,63 @@
+<annotation>
+  <folder>VOC2012</folder>
+  <filename>2007_000032.jpg</filename>
+  <source>
+    <database>The VOC2007 Database</database>
+    <annotation>PASCAL VOC2007</annotation>
+    <image>flickr</image>
+  </source>
+  <size>
+    <width>500</width>
+    <height>281</height>
+    <depth>3</depth>
+  </size>
+  <segmented>1</segmented>
+  <object>
+    <name>aeroplane</name>
+    <pose>Frontal</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>104</xmin>
+      <ymin>78</ymin>
+      <xmax>375</xmax>
+      <ymax>183</ymax>
+    </bndbox>
+  </object>
+  <object>
+    <name>aeroplane</name>
+    <pose>Left</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>133</xmin>
+      <ymin>88</ymin>
+      <xmax>197</xmax>
+      <ymax>123</ymax>
+    </bndbox>
+  </object>
+  <object>
+    <name>person</name>
+    <pose>Rear</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>195</xmin>
+      <ymin>180</ymin>
+      <xmax>213</xmax>
+      <ymax>229</ymax>
+    </bndbox>
+  </object>
+  <object>
+    <name>person</name>
+    <pose>Rear</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>26</xmin>
+      <ymin>189</ymin>
+      <xmax>44</xmax>
+      <ymax>238</ymax>
+    </bndbox>
+  </object>
+</annotation>
diff --git a/dataloader/dataloader.hpp b/dataloader/dataloader.hpp
index 12093a4d..4b3249b2 100644
--- a/dataloader/dataloader.hpp
+++ b/dataloader/dataloader.hpp
@@ -10,10 +10,13 @@
  * http://www.opensource.org/licenses/BSD-3-Clause for more information.
  */
 #include
+#include
 #include
 #include
-#include
 #include
+#include
+#include
 #include
 #include
@@ -117,6 +120,28 @@ class DataLoader
                   std::vector(),
               const double augmentationProbability = 0.2);
 
+  /**
+   * Loads an object detection dataset. It requires a single annotation file
+   * in XML format for each image in the images folder.
+   *
+   * Each XML file should contain the following:
+   * 1. The whole file wrapped in an annotation tag.
+   * 2. A filename tag giving the name of the corresponding image in the
+   *    images folder.
+   * 3. One object tag per bounding box, describing its characteristics.
+   * 4. Each object tag should contain a name tag, i.e. the class of the
+   *    object.
+   * 5. Each object tag should contain a bndbox tag containing xmin, ymin,
+   *    xmax and ymax.
+   *
+   * NOTE: Labels are assigned lexicographically. Set verbose to 1 to print
+   * labels and their corresponding classes.
+   *
+   * @param pathToAnnotations Path to the folder containing XML annotation
+   *     files.
+   * @param pathToImages Path to the folder containing images corresponding
+   *     to the annotations.
+   * @param absolutePath Boolean to determine if absolute path is used.
+   *     Defaults to false.
+   */
+  void LoadObjectDetectionDataset(const std::string& pathToAnnotations,
+                                  const std::string& pathToImages,
+                                  const bool absolutePath = false);
+
   //! Get the training dataset features.
   DatasetX TrainFeatures() const { return trainFeatures; }
@@ -179,11 +204,30 @@ class DataLoader
    */
   void DownloadDataset(const std::string& dataset)
   {
+    if (datasetMap[dataset].zipFile && (!Utils::PathExists(
+        datasetMap[dataset].trainPath) ||
+        !Utils::PathExists(datasetMap[dataset].testPath)))
+    {
+      Utils::DownloadFile(datasetMap[dataset].datasetURL,
+          datasetMap[dataset].datasetPath, dataset + "_training_data.",
+          false, false, datasetMap[dataset].serverName,
+          datasetMap[dataset].zipFile);
+
+      if (!Utils::CompareCRC32(datasetMap[dataset].datasetPath,
+          datasetMap[dataset].datasetHash))
+      {
+        mlpack::Log::Fatal << "Corrupted Data for " << dataset <<
+            " downloaded." << std::endl;
+      }
+
+      return;
+    }
+
     if (!Utils::PathExists(datasetMap[dataset].trainPath))
     {
-      Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
+      Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
           datasetMap[dataset].trainPath, dataset + "_training_data.",
-          false);
+          false, false, datasetMap[dataset].serverName);
 
       if (!Utils::CompareCRC32(datasetMap[dataset].trainPath,
           datasetMap[dataset].trainHash))
@@ -192,11 +236,12 @@ class DataLoader
             dataset << " downloaded." << std::endl;
       }
     }
+
     if (!Utils::PathExists(datasetMap[dataset].testPath))
     {
-      Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
+      Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
           datasetMap[dataset].testPath, dataset + "_testing_data.",
-          false);
+          false, false, datasetMap[dataset].serverName);
 
       if (!Utils::CompareCRC32(datasetMap[dataset].testPath,
           datasetMap[dataset].testHash))
diff --git a/dataloader/dataloader_impl.hpp b/dataloader/dataloader_impl.hpp
index 40a23454..e423fc34 100644
--- a/dataloader/dataloader_impl.hpp
+++ b/dataloader/dataloader_impl.hpp
@@ -106,13 +106,6 @@ template<
   arma::mat trainDataset, validDataset;
   data::Split(dataset, trainDataset, validDataset, ratio, shuffle);
 
-  if (useScaler)
-  {
-    scaler.Fit(trainDataset);
-    scaler.Transform(trainDataset, trainDataset);
-    scaler.Transform(validDataset, validDataset);
-  }
-
   trainFeatures = trainDataset.rows(WrapIndex(startInputFeatures,
       trainDataset.n_rows), WrapIndex(endInputFeatures,
       trainDataset.n_rows));
@@ -125,10 +118,16 @@ template<
       validDataset.n_rows), WrapIndex(endInputFeatures,
       validDataset.n_rows));
 
-  validLabels = trainDataset.rows(WrapIndex(startPredictionFeatures,
+  validLabels = validDataset.rows(WrapIndex(startPredictionFeatures,
       validDataset.n_rows), WrapIndex(endPredictionFeatures,
       validDataset.n_rows));
 
+  if (useScaler)
+  {
+    scaler.Fit(trainFeatures);
+    scaler.Transform(trainFeatures, trainFeatures);
+    scaler.Transform(validFeatures, validFeatures);
+  }
   // TODO : Add support for augmentation here.
   mlpack::Log::Info << "Training Dataset Loaded." << std::endl;
 }
@@ -145,4 +144,37 @@ template<
   }
 }
 
+template<
+  typename DatasetX,
+  typename DatasetY,
+  class ScalerType
+> void DataLoader<
+    DatasetX, DatasetY, ScalerType
+>::LoadObjectDetectionDataset(const std::string& pathToAnnotations,
+                              const std::string& pathToImages,
+                              const bool absolutePath)
+{
+  std::vector<boost::filesystem::path> annotationsDirectory, imagesDirectory;
+
+  // Fill the directory vectors.
+  Utils::ListDir(pathToAnnotations, annotationsDirectory, absolutePath);
+  Utils::ListDir(pathToImages, imagesDirectory, absolutePath);
+
+  // Read each annotation file.
+  for (boost::filesystem::path annotationFile : annotationsDirectory)
+  {
+    // Parse the XML file into a property tree.
+    boost::property_tree::ptree annotation;
+    std::cout << annotationFile.string() << std::endl;
+    boost::property_tree::read_xml(annotationFile.string(), annotation);
+
+    // Read the properties inside the annotation file.
+    BOOST_FOREACH (boost::property_tree::ptree::value_type const& object,
+        annotation.get_child("annotation.object"))
+    {
+      std::cout << object.first << std::endl;
+    }
+  }
+}
+
 #endif
diff --git a/dataloader/datasets.hpp b/dataloader/datasets.hpp
index a0206b8f..57b8b2a3 100644
--- a/dataloader/datasets.hpp
+++ b/dataloader/datasets.hpp
@@ -26,15 +26,47 @@ template<
 >
 struct DatasetDetails
 {
+  //! Locally stored name of the dataset, used for identification during the
+  //! dataloader call.
   std::string datasetName;
-  std::string trainDownloadUrl;
-  std::string testDownloadUrl;
+
+  //! Locally stored URL for downloading training data.
+  std::string trainDownloadURL;
+
+  //! Locally stored URL for downloading testing data.
+  std::string testDownloadURL;
+
+  //! CRC-32 checksum for the training data file.
   std::string trainHash;
+
+  //! CRC-32 checksum for the testing data file.
   std::string testHash;
+
+  //! Locally stored boolean to determine if the dataset is of CSV or a
+  //! similar format.
   bool loadCSV;
+
+  //! Locally stored path to the file / directory for training data.
   std::string trainPath;
+
+  //! Locally stored path to the file / directory for testing data.
   std::string testPath;
 
+  //! Locally stored boolean to determine whether the dataset is in zip
+  //! format.
+  bool zipFile;
+
+  //! Locally stored URL for downloading the dataset.
+  std::string datasetURL;
+
+  //! Locally stored CRC-32 checksum for the dataset.
+  std::string datasetHash;
+
+  //! Locally stored path for saving the archived / zip dataset.
+  std::string datasetPath;
+
+  //! Locally stored server name for the download file.
+  std::string serverName;
+
   // Pre-Process functor.
   std::function PreProcess;
@@ -61,13 +93,18 @@
   // Default constructor.
   DatasetDetails() :
       datasetName(""),
-      trainDownloadUrl(""),
-      testDownloadUrl(""),
+      trainDownloadURL(""),
+      testDownloadURL(""),
      trainHash(""),
      testHash(""),
      loadCSV(false),
      trainPath(""),
      testPath(""),
+      zipFile(false),
+      datasetURL(""),
+      datasetPath(""),
+      datasetHash(""),
+      serverName("www.mlpack.org"),
      startTrainingInputFeatures(0),
      endTrainingInputFeatures(0),
      startTrainingPredictionFeatures(0),
@@ -77,23 +114,85 @@
       dropHeader(false)
   {/* Nothing to do here. */}
 
-  // Constructor for initializing object.
+  /**
+   * Constructor for initializing an object with separate
+   * train and test download URLs.
+   *
+   * @param datasetName Name of dataset used for identification during
+   *     dataloader call.
+   * @param trainDownloadURL URL for downloading training data.
+   * @param testDownloadURL URL for downloading testing data.
+   * @param trainHash CRC-32 checksum for training data.
+   * @param testHash CRC-32 checksum for testing data.
+   * @param loadCSV Determines if the format of the dataset is similar to CSV.
+   * @param trainPath Path for the training dataset.
+   * @param testPath Path for the testing dataset.
+   */
   DatasetDetails(const std::string& datasetName,
-                 const std::string& trainDownloadUrl,
-                 const std::string& testDownloadUrl,
+                 const std::string& trainDownloadURL,
+                 const std::string& testDownloadURL,
                  const std::string& trainHash,
                  const std::string& testHash,
                  const bool loadCSV,
                  const std::string& trainPath,
                  const std::string& testPath) :
                  datasetName(datasetName),
-                 trainDownloadUrl(trainDownloadUrl),
-                 testDownloadUrl(testDownloadUrl),
+                 trainDownloadURL(trainDownloadURL),
+                 testDownloadURL(testDownloadURL),
                  trainHash(trainHash),
                  testHash(testHash),
                  loadCSV(loadCSV),
                  trainPath(trainPath),
                  testPath(testPath),
+                 zipFile(false),
+                 datasetURL(""),
+                 datasetHash(""),
+                 serverName("www.mlpack.org"),
+                 startTrainingInputFeatures(0),
+                 endTrainingInputFeatures(0),
+                 startTrainingPredictionFeatures(0),
+                 endTrainingPredictionFeatures(0),
+                 startTestingInputFeatures(0),
+                 endTestingInputFeatures(0),
+                 dropHeader(false)
+  {
+    // Nothing to do here.
+  }
+
+  /**
+   * Constructor for initializing paths for zip files.
+   *
+   * @param datasetName Name of dataset used for identification during
+   *     dataloader call.
+   * @param zipFile Boolean to determine if dataset is stored in zip format.
+   * @param datasetURL URL for downloading dataset.
+   * @param datasetPath Path where the dataset will be downloaded.
+   * @param datasetHash CRC-32 checksum for dataset.
+   * @param loadCSV Determines if the format of dataset is similar to CSV.
+   * @param trainPath Path for training dataset.
+   * @param testPath Path for testing dataset.
+   */
+  DatasetDetails(const std::string& datasetName,
+                 const bool zipFile,
+                 const std::string& datasetURL,
+                 const std::string& datasetPath,
+                 const std::string& datasetHash,
+                 const bool loadCSV,
+                 const std::string& trainPath,
+                 const std::string& testPath) :
+                 datasetName(datasetName),
+                 zipFile(zipFile),
+                 datasetURL(datasetURL),
+                 datasetHash(datasetHash),
+                 datasetPath(datasetPath),
+                 loadCSV(loadCSV),
+                 trainPath(trainPath),
+                 testPath(testPath),
+                 trainDownloadURL(""),
+                 testDownloadURL(""),
+                 trainHash(""),
+                 testHash(""),
+                 serverName("www.mlpack.org"),
                  startTrainingInputFeatures(0),
                  endTrainingInputFeatures(0),
                  startTrainingPredictionFeatures(0),
@@ -119,14 +218,15 @@ template<
 class Datasets
 {
  public:
+  //! Get details of MNIST Dataset.
   const static DatasetDetails MNIST()
   {
     DatasetDetails mnistDetails(
         "mnist",
-        "/datasets/mnist_train.csv",
-        "/datasets/mnist_test.csv",
-        "772495e3",
-        "8bcdb7e1",
+        true,
+        "/datasets/mnist.tar.gz",
+        "./../data/mnist.tar.gz",
+        "9fa4efe5",
         true,
         "./../data/mnist_train.csv",
         "./../data/mnist_test.csv");
diff --git a/tests/dataloader_tests.cpp b/tests/dataloader_tests.cpp
index e435a61e..86c06630 100644
--- a/tests/dataloader_tests.cpp
+++ b/tests/dataloader_tests.cpp
@@ -50,6 +50,8 @@ BOOST_AUTO_TEST_CASE(CSVDataLoaderTest)
   // Check for training dataset using tuples.
   BOOST_REQUIRE_EQUAL(std::get<0>(irisDataloader.TrainSet()).n_cols, 75);
   BOOST_REQUIRE_EQUAL(std::get<0>(irisDataloader.TrainSet()).n_rows, 4);
+
+  Utils::RemoveFile("./../data/iris.csv");
 }
 
 /**
@@ -57,12 +59,23 @@ BOOST_AUTO_TEST_CASE(CSVDataLoaderTest)
  */
 BOOST_AUTO_TEST_CASE(MNISTDataLoaderTest)
 {
+  /**
   DataLoader<> dataloader("mnist", true, 0.80);
 
   // Check for correct dimensions.
   BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_cols, 784);
   BOOST_REQUIRE_EQUAL(dataloader.TestFeatures().n_cols, 784);
   BOOST_REQUIRE_EQUAL(dataloader.ValidFeatures().n_cols, 784);
   BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_rows, 33600);
+  */
+}
+
+/**
+ * Simple test for the object detection dataloader.
+ */
+BOOST_AUTO_TEST_CASE(ObjectDetectionDataLoader)
+{
+  DataLoader<> dataloader;
+  dataloader.LoadObjectDetectionDataset("./../data/annotations/", "./../data");
 }
 
 BOOST_AUTO_TEST_SUITE_END();
diff --git a/tests/utils_tests.cpp b/tests/utils_tests.cpp
index 90b0131c..aa04fec7 100644
--- a/tests/utils_tests.cpp
+++ b/tests/utils_tests.cpp
@@ -74,4 +74,17 @@ BOOST_AUTO_TEST_CASE(RemoveFileTest)
   BOOST_REQUIRE_EQUAL(Utils::PathExists("./../data/file.txt"), 0);
 }
 
+BOOST_AUTO_TEST_CASE(ExtractFilesTest)
+{
+  Utils::DownloadFile("/datasets/mnist.tar.gz", "./../data/mnist.tar.gz", "",
+      false, true, "www.mlpack.org", true, "./../data/");
+
+  BOOST_REQUIRE(Utils::PathExists("./../data/mnist_all.csv"));
+  BOOST_REQUIRE(Utils::PathExists("./../data/mnist.tar.gz"));
+
+  // Clean up.
+  Utils::RemoveFile("./../data/mnist_all.csv");
+  Utils::RemoveFile("./../data/mnist_all_centroids.csv");
+}
+
 BOOST_AUTO_TEST_SUITE_END();
diff --git a/utils/utils.hpp b/utils/utils.hpp
index 903aa201..6709d7dc 100644
--- a/utils/utils.hpp
+++ b/utils/utils.hpp
@@ -43,6 +43,47 @@ class Utils
     return (stat(filePath.c_str(), &buffer) == 0);
   }
 
+  /**
+   * Unzips any supported tar file.
+   *
+   * @param pathToArchive Path to where the tar file is stored.
+   * @param pathForExtraction Path where files will be extracted.
+   * @param absolutePath Boolean to determine if path is absolute or relative.
+   */
+  static int ExtractFiles(const std::string pathToArchive,
+                          const std::string pathForExtraction,
+                          const bool absolutePath = false)
+  {
+    std::string command = "tar -xvzf ";
+    if (!absolutePath)
+    {
+      #ifdef _WIN32
+      std::string pathToArchiveTemp(pathToArchive);
+      std::string pathForExtractionTemp(pathForExtraction);
+      std::replace(pathToArchiveTemp.begin(), pathToArchiveTemp.end(), '/',
+          '\\');
+      std::replace(pathForExtractionTemp.begin(), pathForExtractionTemp.end(),
+          '/', '\\');
+
+      command = "tar --force-local -xvzf " +
+          boost::filesystem::current_path().string() + "\\" +
+          pathToArchiveTemp;
+      #else
+      command = command + boost::filesystem::current_path().string() + "/" +
+          pathToArchive + " -C " + boost::filesystem::current_path().string() +
+          "/" + pathForExtraction;
+      #endif
+    }
+    else
+    {
+      command = command + pathToArchive + " -C " + pathForExtraction;
+    }
+
+    // Run the command using a system call.
+    std::system(command.c_str());
+    return 0;
+  }
+
   /**
    * Downloads files using boost asio.
    *
@@ -55,6 +96,8 @@ class Utils
    * @param absolutePath Boolean to determine if path is absolute or relative.
    * @param silent Boolean to display details of file being downloaded.
    * @param serverName Server to connect to, for downloading.
+   * @param zipFile Determines if the dataset needs to be extracted or not.
+   * @param pathForExtraction Path where files will be extracted if zipFile is true.
    * @returns 0 to determine success.
    */
   static int DownloadFile(const std::string url,
@@ -63,7 +106,9 @@ class Utils
                           const bool absolutePath = false,
                           const bool silent = true,
                           const std::string serverName =
-                              "www.mlpack.org")
+                              "www.mlpack.org",
+                          const bool zipFile = false,
+                          const std::string pathForExtraction = "./../data/")
   {
     // IO functionality by boost core.
     boost::asio::io_service ioService;
@@ -151,6 +196,13 @@ class Utils
     }
 
     outputFile.close();
+
+    // Extract Files.
+    if (zipFile)
+    {
+      Utils::ExtractFiles(downloadPath, pathForExtraction);
+    }
+
     return 0;
   }
@@ -213,5 +265,36 @@ class Utils
     return 0;
   }
 
+  /**
+   * Fills a vector with paths to all files in a directory.
+   *
+   * @param path Path to the directory.
+   * @param pathVector A vector of type boost::filesystem::path, which will be
+   *     filled with paths for all files / folders in the given directory.
+   * @param absolutePath Boolean to determine if path is absolute or relative.
+   */
+  static void ListDir(const std::string& path,
+                      std::vector<boost::filesystem::path>& pathVector,
+                      const bool absolutePath = false)
+  {
+    if (Utils::PathExists(path, absolutePath))
+    {
+      boost::filesystem::path directoryPath(path);
+
+      // Fill the path vector with the respective paths.
+      std::copy(boost::filesystem::directory_iterator(directoryPath),
+          boost::filesystem::directory_iterator(),
+          std::back_inserter(pathVector));
+
+      // Sort the path vector.
+      std::sort(pathVector.begin(), pathVector.end());
+    }
+    else
+    {
+      mlpack::Log::Warn << "The path " << path << " doesn't exist." <<
+          std::endl;
+    }
+  }
 };
+
 #endif
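
Usage sketch for the interfaces added by this patch (illustrative only, not part of the diff). The include paths and the main() wrapper are assumptions about the surrounding project layout; the dataset URL, server name, and local paths simply mirror the values used in the tests above.

// Sketch: download an archived dataset, extract it, and load a PASCAL VOC
// style object detection dataset with the API introduced in this patch.
#include <dataloader/dataloader.hpp>  // Assumed include path.
#include <utils/utils.hpp>            // Assumed include path.

int main()
{
  // Download and extract an archived dataset. The arguments follow the
  // DownloadFile() signature in utils.hpp: url, downloadPath, name,
  // absolutePath, silent, serverName, zipFile, pathForExtraction.
  Utils::DownloadFile("/datasets/mnist.tar.gz", "./../data/mnist.tar.gz", "",
      false, false, "www.mlpack.org", true, "./../data/");

  // Load an object detection dataset: one XML annotation file per image,
  // each wrapped in an annotation tag with object / bndbox children.
  DataLoader<> dataloader;
  dataloader.LoadObjectDetectionDataset("./../data/annotations/",
      "./../data/");

  return 0;
}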