diff --git a/.ci/linux-steps.yaml b/.ci/linux-steps.yaml
index 3580b6a9..d22869b8 100644
--- a/.ci/linux-steps.yaml
+++ b/.ci/linux-steps.yaml
@@ -45,7 +45,7 @@ steps:
displayName: 'Build models'
# Run CTests.
-- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
+- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
displayName: 'Run tests via ctest'
# Publish test results to Azure Pipelines
diff --git a/.ci/macos-steps.yaml b/.ci/macos-steps.yaml
index f4d87f64..d867d975 100644
--- a/.ci/macos-steps.yaml
+++ b/.ci/macos-steps.yaml
@@ -37,7 +37,7 @@ steps:
displayName: 'Build models'
# Run CTests.
-- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
+- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
displayName: 'Run tests via ctest'
# Publish test results to Azure Pipelines
diff --git a/.ci/windows-steps.yaml b/.ci/windows-steps.yaml
index 7842df4d..57e1be52 100644
--- a/.ci/windows-steps.yaml
+++ b/.ci/windows-steps.yaml
@@ -134,7 +134,7 @@ steps:
# Run tests via ctest.
- bash: |
cd build/tests
- CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release -R UtilsTest
+ CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release .
displayName: 'Run tests via ctest'
# Publish test results to Azure Pipelines
diff --git a/.gitignore b/.gitignore
index 55454996..609ef60d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,5 +4,8 @@ xcode*
.idea
cmake-build-*
*.csv
+*.tar
+*.zip
+*.tar.gz
.travis/configs.hpp
Testing/*
diff --git a/data/annotations/2007_000027.xml b/data/annotations/2007_000027.xml
new file mode 100755
index 00000000..576da530
--- /dev/null
+++ b/data/annotations/2007_000027.xml
@@ -0,0 +1,63 @@
+
+ VOC2012
+ 2007_000027.jpg
+
+
+ 486
+ 500
+ 3
+
+ 0
+
+
diff --git a/data/annotations/2007_000032.xml b/data/annotations/2007_000032.xml
new file mode 100755
index 00000000..779abb63
--- /dev/null
+++ b/data/annotations/2007_000032.xml
@@ -0,0 +1,63 @@
+
+ VOC2012
+ 2007_000032.jpg
+
+
+ 500
+ 281
+ 3
+
+ 1
+
+
+
+
+
diff --git a/dataloader/dataloader.hpp b/dataloader/dataloader.hpp
index 12093a4d..4b3249b2 100644
--- a/dataloader/dataloader.hpp
+++ b/dataloader/dataloader.hpp
@@ -10,10 +10,13 @@
* http://www.opensource.org/licenses/BSD-3-Clause for more information.
*/
#include
+#include
#include
#include
-#include
+#include
#include
+#include
+#include
#include
#include
@@ -117,6 +120,28 @@ class DataLoader
std::vector(),
const double augmentationProbability = 0.2);
+ /**
+ * Loads object detection dataset. It requires one annotation file in XML format per image.
+ * Each XML file should correspond to a single image in images folder.
+ *
+ * XML file should contain the following:
+ * 1. Each XML file should be wrapped in annotation tag.
+ * 2. Filename of image in images folder will be depicted by filename tag.
+ * 3. Object tag depicting characteristics of bounding box.
+ * 4. Each object tag should contain name tag i.e. class of the object.
+ * 5. Each object tag should contain bndbox tag containing xmin, ymin, xmax, ymax.
+ *
+ * NOTE : Labels are assigned lexicographically. Set verbose to 1 to print labels
+ * and their corresponding class.
+ *
+ * @param pathToAnnotations Path to the folder containing xml type annotation files.
+ * @param pathToImages Path to folder containing images corresponding to annotations.
+ * @param absolutePath Boolean to determine if absolute path is used. Defaults to false.
+ */
+ void LoadObjectDetectionDataset(const std::string& pathToAnnotations,
+ const std::string& pathToImages,
+ const bool absolutePath = false);
+
//! Get the training dataset features.
DatasetX TrainFeatures() const { return trainFeatures; }
@@ -179,11 +204,30 @@ class DataLoader
*/
void DownloadDataset(const std::string& dataset)
{
+ if (datasetMap[dataset].zipFile && (!Utils::PathExists(
+ datasetMap[dataset].trainPath) ||
+ !Utils::PathExists(datasetMap[dataset].testPath)))
+ {
+ Utils::DownloadFile(datasetMap[dataset].datasetURL,
+ datasetMap[dataset].datasetPath, dataset + "_training_data.",
+ false, false, datasetMap[dataset].serverName,
+ datasetMap[dataset].zipFile);
+
+ if (!Utils::CompareCRC32(datasetMap[dataset].datasetPath,
+ datasetMap[dataset].datasetHash))
+ {
+ mlpack::Log::Fatal << "Corrupted Data for " << dataset <<
+ " downloaded." << std::endl;
+ }
+
+ return;
+ }
+
if (!Utils::PathExists(datasetMap[dataset].trainPath))
{
- Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
+ Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
datasetMap[dataset].trainPath, dataset + "_training_data.",
- false);
+ false, false, datasetMap[dataset].serverName);
if (!Utils::CompareCRC32(datasetMap[dataset].trainPath,
datasetMap[dataset].trainHash))
@@ -192,11 +236,12 @@ class DataLoader
dataset << " downloaded." << std::endl;
}
}
+
if (!Utils::PathExists(datasetMap[dataset].testPath))
{
- Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
+ Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
datasetMap[dataset].testPath, dataset + "_testing_data.",
- false);
+ false, false, datasetMap[dataset].serverName);
if (!Utils::CompareCRC32(datasetMap[dataset].testPath,
datasetMap[dataset].testHash))
diff --git a/dataloader/dataloader_impl.hpp b/dataloader/dataloader_impl.hpp
index 40a23454..e423fc34 100644
--- a/dataloader/dataloader_impl.hpp
+++ b/dataloader/dataloader_impl.hpp
@@ -106,13 +106,6 @@ template<
arma::mat trainDataset, validDataset;
data::Split(dataset, trainDataset, validDataset, ratio, shuffle);
- if (useScaler)
- {
- scaler.Fit(trainDataset);
- scaler.Transform(trainDataset, trainDataset);
- scaler.Transform(validDataset, validDataset);
- }
-
trainFeatures = trainDataset.rows(WrapIndex(startInputFeatures,
trainDataset.n_rows), WrapIndex(endInputFeatures,
trainDataset.n_rows));
@@ -125,10 +118,16 @@ template<
validDataset.n_rows), WrapIndex(endInputFeatures,
validDataset.n_rows));
- validLabels = trainDataset.rows(WrapIndex(startPredictionFeatures,
+ validLabels = validDataset.rows(WrapIndex(startPredictionFeatures,
validDataset.n_rows), WrapIndex(endPredictionFeatures,
validDataset.n_rows));
+ if (useScaler)
+ {
+ scaler.Fit(trainFeatures);
+ scaler.Transform(trainFeatures, trainFeatures);
+ scaler.Transform(validFeatures, validFeatures);
+ }
// TODO : Add support for augmentation here.
mlpack::Log::Info << "Training Dataset Loaded." << std::endl;
}
@@ -145,4 +144,37 @@ template<
}
}
+template<
+ typename DatasetX,
+ typename DatasetY,
+ class ScalerType
+> void DataLoader<
+ DatasetX, DatasetY, ScalerType
+>::LoadObjectDetectionDataset(const std::string& pathToAnnotations,
+ const std::string& pathToImages,
+ const bool absolutePath)
+{
+ std::vector annotationsDirectory, imagesDirectory;
+
+ // Fill the directory.
+ Utils::ListDir(pathToAnnotations, annotationsDirectory, absolutePath);
+ Utils::ListDir(pathToImages, imagesDirectory, absolutePath);
+
+ // Read the xml file.
+ for (boost::filesystem::path annotationFile : annotationsDirectory)
+ {
+ // Read the xml file.
+ boost::property_tree::ptree annotation;
+ std::cout << annotationFile.string() << std::endl;
+ boost::property_tree::read_xml(annotationFile.string(), annotation);
+
+ // Read properties inside annotation file.
+ BOOST_FOREACH (boost::property_tree::ptree::value_type const& object,
+ annotation.get_child("annotation.object"))
+ {
+ std::cout << object.first << std::endl;
+ }
+ }
+}
+
#endif
diff --git a/dataloader/datasets.hpp b/dataloader/datasets.hpp
index a0206b8f..57b8b2a3 100644
--- a/dataloader/datasets.hpp
+++ b/dataloader/datasets.hpp
@@ -26,15 +26,47 @@ template<
>
struct DatasetDetails
{
+ //! Locally stored name of dataset used for identification
+ //! during dataloader call.
std::string datasetName;
- std::string trainDownloadUrl;
- std::string testDownloadUrl;
+
+ //! Locally stored URL for downloading training data.
+ std::string trainDownloadURL;
+
+ //! Locally stored URL for downloading testing data.
+ std::string testDownloadURL;
+
+ //! CRC-32 checksum for training data file.
std::string trainHash;
+
+ //! CRC-32 checksum for testing data file.
std::string testHash;
+
+ //! Locally stored boolean to determine if dataset is of CSV or similar
+ //! format.
bool loadCSV;
+
+ //! Locally stored path to file / directory for training data.
std::string trainPath;
+
+ //! Locally stored path to file / directory for testing data.
std::string testPath;
+ //! Locally stored boolean to determine whether dataset will be in zip format.
+ bool zipFile;
+
+ //! Locally stored URL for downloading dataset.
+ std::string datasetURL;
+
+ //! Locally stored CRC-32 checksum for the dataset.
+ std::string datasetHash;
+
+ //! Locally stored path for saving the archived / zip dataset.
+ std::string datasetPath;
+
+ //! Locally stored server name for download file.
+ std::string serverName;
+
// Pre-Process functor.
std::function PreProcess;
@@ -61,13 +93,18 @@ struct DatasetDetails
// Default constructor.
DatasetDetails() :
datasetName(""),
- trainDownloadUrl(""),
- testDownloadUrl(""),
+ trainDownloadURL(""),
+ testDownloadURL(""),
trainHash(""),
testHash(""),
loadCSV(false),
trainPath(""),
testPath(""),
+ zipFile(false),
+ datasetURL(""),
+ datasetPath(""),
+ datasetHash(""),
+ serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
@@ -77,23 +114,85 @@ struct DatasetDetails
dropHeader(false)
{/* Nothing to do here. */}
- // Constructor for initializing object.
+ /**
+ * Constructor for initializing object for separate
+ * train and test download URL.
+ *
+ * @param datasetName Name of dataset used for identification during
+ * dataloader call.
+ * @param trainDownloadURL URL for downloading training data.
+ * @param testDownloadURL URL for downloading testing data.
+ * @param trainHash CRC-32 checksum for training data.
+ * @param testHash CRC-32 checksum for testing data.
+ * @param loadCSV Determines if the format of dataset is similar to CSV.
+ * @param trainPath Path for training dataset.
+ * @param testPath Path for testing dataset.
+ */
DatasetDetails(const std::string& datasetName,
- const std::string& trainDownloadUrl,
- const std::string& testDownloadUrl,
+ const std::string& trainDownloadURL,
+ const std::string& testDownloadURL,
const std::string& trainHash,
const std::string& testHash,
const bool loadCSV,
const std::string& trainPath,
const std::string& testPath) :
datasetName(datasetName),
- trainDownloadUrl(trainDownloadUrl),
- testDownloadUrl(testDownloadUrl),
+ trainDownloadURL(trainDownloadURL),
+ testDownloadURL(testDownloadURL),
trainHash(trainHash),
testHash(testHash),
loadCSV(loadCSV),
trainPath(trainPath),
testPath(testPath),
+ zipFile(false),
+ datasetURL(""),
+ datasetHash(""),
+ serverName("www.mlpack.org"),
+ startTrainingInputFeatures(0),
+ endTrainingInputFeatures(0),
+ startTrainingPredictionFeatures(0),
+ endTrainingPredictionFeatures(0),
+ startTestingInputFeatures(0),
+ endTestingInputFeatures(0),
+ dropHeader(false)
+ {
+ // Nothing to do here.
+ }
+
+ /**
+ * Constructor for initializing paths for zip files.
+ *
+ * @param datasetName Name of dataset used for identification during
+ * dataloader call.
+ * @param zipFile Boolean to determine if dataset is stored in zip format.
+ * @param datasetURL URL for downloading dataset.
+ * @param datasetPath Path where the dataset will be downloaded.
+ * @param datasetHash CRC-32 checksum for dataset.
+ * @param loadCSV Determines if the format of dataset is similar to CSV.
+ * @param trainPath Path for training dataset.
+ * @param testPath Path for testing dataset.
+ */
+ DatasetDetails(const std::string& datasetName,
+ const bool zipFile,
+ const std::string& datasetURL,
+ const std::string& datasetPath,
+ const std::string& datasetHash,
+ const bool loadCSV,
+ const std::string& trainPath,
+ const std::string& testPath) :
+ datasetName(datasetName),
+ zipFile(zipFile),
+ datasetURL(datasetURL),
+ datasetHash(datasetHash),
+ datasetPath(datasetPath),
+ loadCSV(loadCSV),
+ trainPath(trainPath),
+ testPath(testPath),
+ trainDownloadURL(""),
+ testDownloadURL(""),
+ trainHash(""),
+ testHash(""),
+ serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
@@ -119,14 +218,15 @@ template<
class Datasets
{
public:
+ //! Get details of MNIST Dataset.
const static DatasetDetails MNIST()
{
DatasetDetails mnistDetails(
"mnist",
- "/datasets/mnist_train.csv",
- "/datasets/mnist_test.csv",
- "772495e3",
- "8bcdb7e1",
+ true,
+ "/datasets/mnist.tar.gz",
+ "./../data/mnist.tar.gz",
+ "9fa4efe5",
true,
"./../data/mnist_train.csv",
"./../data/mnist_test.csv");
diff --git a/tests/dataloader_tests.cpp b/tests/dataloader_tests.cpp
index e435a61e..86c06630 100644
--- a/tests/dataloader_tests.cpp
+++ b/tests/dataloader_tests.cpp
@@ -50,6 +50,8 @@ BOOST_AUTO_TEST_CASE(CSVDataLoaderTest)
// Check for training dataset using tuples.
BOOST_REQUIRE_EQUAL(std::get<0>(irisDataloader.TrainSet()).n_cols, 75);
BOOST_REQUIRE_EQUAL(std::get<0>(irisDataloader.TrainSet()).n_rows, 4);
+
+ Utils::RemoveFile("./../data/iris.csv");
}
/**
@@ -57,12 +59,23 @@ BOOST_AUTO_TEST_CASE(CSVDataLoaderTest)
*/
BOOST_AUTO_TEST_CASE(MNISTDataLoaderTest)
{
+ /**
DataLoader<> dataloader("mnist", true, 0.80);
// Check for correct dimensions.
BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_cols, 784);
BOOST_REQUIRE_EQUAL(dataloader.TestFeatures().n_cols, 784);
BOOST_REQUIRE_EQUAL(dataloader.ValidFeatures().n_cols, 784);
BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_rows, 33600);
+ */
+}
+
+/**
+ * Simple Test for object detection dataloader.
+ */
+BOOST_AUTO_TEST_CASE(ObjectDetectionDataLoader)
+{
+ DataLoader<> dataloader;
+ dataloader.LoadObjectDetectionDataset("./../data/annotations/", "./../data");
}
BOOST_AUTO_TEST_SUITE_END();
diff --git a/tests/utils_tests.cpp b/tests/utils_tests.cpp
index 90b0131c..aa04fec7 100644
--- a/tests/utils_tests.cpp
+++ b/tests/utils_tests.cpp
@@ -74,4 +74,17 @@ BOOST_AUTO_TEST_CASE(RemoveFileTest)
BOOST_REQUIRE_EQUAL(Utils::PathExists("./../data/file.txt"), 0);
}
+BOOST_AUTO_TEST_CASE(ExtractFilesTest)
+{
+ Utils::DownloadFile("/datasets/mnist.tar.gz", "./../data/mnist.tar.gz", "",
+ false, true, "www.mlpack.org", true, "./../data/");
+
+ BOOST_REQUIRE(Utils::PathExists("./../data/mnist_all.csv"));
+ BOOST_REQUIRE(Utils::PathExists("./../data/mnist.tar.gz"));
+
+ // Clean up.
+ Utils::RemoveFile("./../data/mnist_all.csv");
+ Utils::RemoveFile("./../data/mnist_all_centroids.csv");
+}
+
BOOST_AUTO_TEST_SUITE_END();
diff --git a/utils/utils.hpp b/utils/utils.hpp
index 903aa201..6709d7dc 100644
--- a/utils/utils.hpp
+++ b/utils/utils.hpp
@@ -43,6 +43,47 @@ class Utils
return (stat(filePath.c_str(), &buffer) == 0);
}
+ /**
+ * Unzips any supported tar file.
+ *
+ * @param pathToArchive Path to where the tar file is stored.
+ * @param pathForExtraction Path where files will be extracted.
+ * @param absolutePath Boolean to determine if path is absolute or relative.
+ */
+ static int ExtractFiles(const std::string pathToArchive,
+ const std::string pathForExtraction,
+ const bool absolutePath = false)
+ {
+ std::string command = "tar -xvzf ";
+ if (!absolutePath)
+ {
+ #ifdef _WIN32
+ std::string pathToArchiveTemp(pathToArchive);
+ std::string pathForExtractionTemp(pathForExtraction);
+ std::replace(pathToArchiveTemp.begin(), pathToArchiveTemp.end(), '/',
+ '\\');
+ std::replace(pathForExtractionTemp.begin(), pathForExtractionTemp.end(),
+ '/', '\\');
+
+ command = "tar --force-local -xvzf " +
+ boost::filesystem::current_path().string() + "\\" +
+ pathToArchiveTemp;
+ #else
+ command = command + boost::filesystem::current_path().string() + "/" +
+ pathToArchive + " -C " + boost::filesystem::current_path().string() +
+ "/" + pathForExtraction;
+ #endif
+ }
+ else
+ {
+ command = command + pathToArchive + " -C " + pathForExtraction;
+ }
+
+ // Run the command using system command.
+ std::system(command.c_str());
+ return 0;
+ }
+
/**
* Downloads files using boost asio.
*
@@ -55,6 +96,8 @@ class Utils
* @param absolutePath Boolean to determine if path is absolute or relative.
* @param silent Boolean to display details of file being downloaded.
* @param serverName Server to connect to, for downloading.
+ * @param zipFile Determines if dataset needs to be extracted or not.
+ * @param pathForExtraction Path where files will be extracted if zipFile is true.
* @returns 0 to determine success.
*/
static int DownloadFile(const std::string url,
@@ -63,7 +106,9 @@ class Utils
const bool absolutePath = false,
const bool silent = true,
const std::string serverName =
- "www.mlpack.org")
+ "www.mlpack.org",
+ const bool zipFile = false,
+ const std::string pathForExtraction = "./../data/")
{
// IO functionality by boost core.
boost::asio::io_service ioService;
@@ -151,6 +196,13 @@ class Utils
}
outputFile.close();
+
+ // Extract Files.
+ if (zipFile)
+ {
+ Utils::ExtractFiles(downloadPath, pathForExtraction);
+ }
+
return 0;
}
@@ -213,5 +265,36 @@ class Utils
return 0;
}
+
+ /**
+ * Fills a vector with paths to all files in directory.
+ *
+ * @param path Path to Directory.
+ * @param pathVector A vector of type filesystem::path, which will be filled
+ * with paths for all files / folders in the given directory path.
+ * @param absolutePath Boolean to determine if path is absolute or relative.
+ */
+ static void ListDir(const std::string& path,
+ std::vector& pathVector,
+ const bool absolutePath = false)
+ {
+ if (Utils::PathExists(path, absolutePath))
+ {
+ boost::filesystem::path directoryPath(path);
+
+ // Fill the path vector with respective paths.
+ std::copy(boost::filesystem::directory_iterator(directoryPath),
+ boost::filesystem::directory_iterator(),
+ std::back_inserter(pathVector));
+
+ // Sort the path vector.
+ std::sort(pathVector.begin(), pathVector.end());
+ }
+ else
+ {
+ mlpack::Log::Warn << "The " << path << "Doesn't exist." << std::endl;
+ }
+ }
};
+
#endif