From 3353e2ed1912ba8eb5dbec17cd8d458aa9e39dd6 Mon Sep 17 00:00:00 2001
From: kartikdutt18
Date: Wed, 20 May 2020 22:41:40 +0530
Subject: [PATCH] Add basic definition of models, needs to be trained and tested

Trained a lenet1 model
Add All Weights
Add unzip utility function, fix linux build
Completed everything
Fix build, Style fixes next
Use force local for windows while unzipping tar files
Use force local for windows while unzipping tar files
Added Utility Function
Added Utility Function
Style Fix
Fix Typo causing build error in windows
Fix Typo causing build error in windows
Fix const issue for windows
Extract in data folder
Reposition force local
This should work
Print Path in windows for debugging
Print Path in windows for debugging
Print Path in windows for debugging
Strip components of tar
Strip components of tar
Strip components
Initial Definition
Add List Dir utility function
Add List Dir utility function
Squash this, annotation read
Yay, we can read xml now
---
 .ci/linux-steps.yaml             |   2 +-
 .ci/macos-steps.yaml             |   2 +-
 .ci/windows-steps.yaml           |   2 +-
 .gitignore                       |   3 +
 data/annotations/2007_000027.xml |  63 ++++++++++++++++
 data/annotations/2007_000032.xml |  63 ++++++++++++++++
 dataloader/dataloader.hpp        |  55 ++++++++++++--
 dataloader/dataloader_impl.hpp   |  48 ++++++++++--
 dataloader/datasets.hpp          | 126 +++++++++++++++++++++++++++----
 tests/dataloader_tests.cpp       |  13 ++++
 tests/utils_tests.cpp            |  13 ++++
 utils/utils.hpp                  |  85 ++++++++++++++++++++-
 12 files changed, 445 insertions(+), 30 deletions(-)
 create mode 100755 data/annotations/2007_000027.xml
 create mode 100755 data/annotations/2007_000032.xml

diff --git a/.ci/linux-steps.yaml b/.ci/linux-steps.yaml
index 3580b6a9..d22869b8 100644
--- a/.ci/linux-steps.yaml
+++ b/.ci/linux-steps.yaml
@@ -45,7 +45,7 @@ steps:
     displayName: 'Build models'
 
 # Run CTests.
-- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
+- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
   displayName: 'Run tests via ctest'
 
 # Publish test results to Azure Pipelines
diff --git a/.ci/macos-steps.yaml b/.ci/macos-steps.yaml
index f4d87f64..d867d975 100644
--- a/.ci/macos-steps.yaml
+++ b/.ci/macos-steps.yaml
@@ -37,7 +37,7 @@ steps:
     displayName: 'Build models'
 
 # Run CTests.
-- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
+- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
   displayName: 'Run tests via ctest'
 
 # Publish test results to Azure Pipelines
diff --git a/.ci/windows-steps.yaml b/.ci/windows-steps.yaml
index 7842df4d..57e1be52 100644
--- a/.ci/windows-steps.yaml
+++ b/.ci/windows-steps.yaml
@@ -134,7 +134,7 @@ steps:
 # Run tests via ctest.
 - bash: |
     cd build/tests
-    CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release -R UtilsTest
+    CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release .
   displayName: 'Run tests via ctest'
 
 # Publish test results to Azure Pipelines
diff --git a/.gitignore b/.gitignore
index 55454996..609ef60d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,5 +4,8 @@ xcode*
 .idea
 cmake-build-*
 *.csv
+*.tar
+*.zip
+*.tar.gz
 .travis/configs.hpp
 Testing/*
diff --git a/data/annotations/2007_000027.xml b/data/annotations/2007_000027.xml
new file mode 100755
index 00000000..576da530
--- /dev/null
+++ b/data/annotations/2007_000027.xml
@@ -0,0 +1,63 @@
+<annotation>
+  <folder>VOC2012</folder>
+  <filename>2007_000027.jpg</filename>
+  <source>
+    <database>The VOC2007 Database</database>
+    <annotation>PASCAL VOC2007</annotation>
+    <image>flickr</image>
+  </source>
+  <size>
+    <width>486</width>
+    <height>500</height>
+    <depth>3</depth>
+  </size>
+  <segmented>0</segmented>
+  <object>
+    <name>person</name>
+    <pose>Unspecified</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>174</xmin>
+      <ymin>101</ymin>
+      <xmax>349</xmax>
+      <ymax>351</ymax>
+    </bndbox>
+    <part>
+      <name>head</name>
+      <bndbox>
+        <xmin>169</xmin>
+        <ymin>104</ymin>
+        <xmax>209</xmax>
+        <ymax>146</ymax>
+      </bndbox>
+    </part>
+    <part>
+      <name>hand</name>
+      <bndbox>
+        <xmin>278</xmin>
+        <ymin>210</ymin>
+        <xmax>297</xmax>
+        <ymax>233</ymax>
+      </bndbox>
+    </part>
+    <part>
+      <name>foot</name>
+      <bndbox>
+        <xmin>273</xmin>
+        <ymin>333</ymin>
+        <xmax>297</xmax>
+        <ymax>354</ymax>
+      </bndbox>
+    </part>
+    <part>
+      <name>foot</name>
+      <bndbox>
+        <xmin>319</xmin>
+        <ymin>307</ymin>
+        <xmax>340</xmax>
+        <ymax>326</ymax>
+      </bndbox>
+    </part>
+  </object>
+</annotation>
diff --git a/data/annotations/2007_000032.xml b/data/annotations/2007_000032.xml
new file mode 100755
index 00000000..779abb63
--- /dev/null
+++ b/data/annotations/2007_000032.xml
@@ -0,0 +1,63 @@
+<annotation>
+  <folder>VOC2012</folder>
+  <filename>2007_000032.jpg</filename>
+  <source>
+    <database>The VOC2007 Database</database>
+    <annotation>PASCAL VOC2007</annotation>
+    <image>flickr</image>
+  </source>
+  <size>
+    <width>500</width>
+    <height>281</height>
+    <depth>3</depth>
+  </size>
+  <segmented>1</segmented>
+  <object>
+    <name>aeroplane</name>
+    <pose>Frontal</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>104</xmin>
+      <ymin>78</ymin>
+      <xmax>375</xmax>
+      <ymax>183</ymax>
+    </bndbox>
+  </object>
+  <object>
+    <name>aeroplane</name>
+    <pose>Left</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>133</xmin>
+      <ymin>88</ymin>
+      <xmax>197</xmax>
+      <ymax>123</ymax>
+    </bndbox>
+  </object>
+  <object>
+    <name>person</name>
+    <pose>Rear</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>195</xmin>
+      <ymin>180</ymin>
+      <xmax>213</xmax>
+      <ymax>229</ymax>
+    </bndbox>
+  </object>
+  <object>
+    <name>person</name>
+    <pose>Rear</pose>
+    <truncated>0</truncated>
+    <difficult>0</difficult>
+    <bndbox>
+      <xmin>26</xmin>
+      <ymin>189</ymin>
+      <xmax>44</xmax>
+      <ymax>238</ymax>
+    </bndbox>
+  </object>
+</annotation>
diff --git a/dataloader/dataloader.hpp b/dataloader/dataloader.hpp
index 12093a4d..4b3249b2 100644
--- a/dataloader/dataloader.hpp
+++ b/dataloader/dataloader.hpp
@@ -10,10 +10,13 @@
  * http://www.opensource.org/licenses/BSD-3-Clause for more information.
  */
 #include
+#include
 #include
 #include
-#include
 #include
+#include
+#include
 #include
 #include
@@ -117,6 +120,28 @@ class DataLoader
                   std::vector(),
               const double augmentationProbability = 0.2);
 
+  /**
+   * Loads an object detection dataset. It requires a single annotation file
+   * in XML format for each image in the images folder.
+   *
+   * Each XML file should contain the following:
+   * 1. The whole file wrapped in an annotation tag.
+   * 2. A filename tag giving the name of the corresponding image in the
+   *    images folder.
+   * 3. One object tag per bounding box, describing its characteristics.
+   * 4. Each object tag should contain a name tag, i.e. the class of the
+   *    object.
+   * 5. Each object tag should contain a bndbox tag containing xmin, ymin,
+   *    xmax and ymax.
+   *
+   * NOTE: Labels are assigned lexicographically. Set verbose to 1 to print
+   * labels and their corresponding classes.
+   *
+   * @param pathToAnnotations Path to the folder containing XML annotation
+   *     files.
+   * @param pathToImages Path to the folder containing images corresponding
+   *     to the annotations.
+   * @param absolutePath Boolean to determine if absolute path is used.
+   *     Defaults to false.
+   */
+  void LoadObjectDetectionDataset(const std::string& pathToAnnotations,
+                                  const std::string& pathToImages,
+                                  const bool absolutePath = false);
+
   //! Get the training dataset features.
   DatasetX TrainFeatures() const { return trainFeatures; }
@@ -179,11 +204,30 @@ class DataLoader
    */
   void DownloadDataset(const std::string& dataset)
   {
+    if (datasetMap[dataset].zipFile && (!Utils::PathExists(
+        datasetMap[dataset].trainPath) ||
+        !Utils::PathExists(datasetMap[dataset].testPath)))
+    {
+      Utils::DownloadFile(datasetMap[dataset].datasetURL,
+          datasetMap[dataset].datasetPath, dataset + "_training_data.",
+          false, false, datasetMap[dataset].serverName,
+          datasetMap[dataset].zipFile);
+
+      if (!Utils::CompareCRC32(datasetMap[dataset].datasetPath,
+          datasetMap[dataset].datasetHash))
+      {
+        mlpack::Log::Fatal << "Corrupted Data for " << dataset <<
+            " downloaded." << std::endl;
+      }
+
+      return;
+    }
+
     if (!Utils::PathExists(datasetMap[dataset].trainPath))
     {
-      Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
+      Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
           datasetMap[dataset].trainPath, dataset + "_training_data.",
-          false);
+          false, false, datasetMap[dataset].serverName);
 
       if (!Utils::CompareCRC32(datasetMap[dataset].trainPath,
           datasetMap[dataset].trainHash))
@@ -192,11 +236,12 @@ class DataLoader
             dataset << " downloaded." << std::endl;
       }
     }
+
     if (!Utils::PathExists(datasetMap[dataset].testPath))
     {
-      Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
+      Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
           datasetMap[dataset].testPath, dataset + "_testing_data.",
-          false);
+          false, false, datasetMap[dataset].serverName);
 
       if (!Utils::CompareCRC32(datasetMap[dataset].testPath,
           datasetMap[dataset].testHash))
diff --git a/dataloader/dataloader_impl.hpp b/dataloader/dataloader_impl.hpp
index 40a23454..e423fc34 100644
--- a/dataloader/dataloader_impl.hpp
+++ b/dataloader/dataloader_impl.hpp
@@ -106,13 +106,6 @@ template<
   arma::mat trainDataset, validDataset;
   data::Split(dataset, trainDataset, validDataset, ratio, shuffle);
 
-  if (useScaler)
-  {
-    scaler.Fit(trainDataset);
-    scaler.Transform(trainDataset, trainDataset);
-    scaler.Transform(validDataset, validDataset);
-  }
-
   trainFeatures = trainDataset.rows(WrapIndex(startInputFeatures,
       trainDataset.n_rows), WrapIndex(endInputFeatures,
       trainDataset.n_rows));
@@ -125,10 +118,16 @@ template<
       validDataset.n_rows), WrapIndex(endInputFeatures,
       validDataset.n_rows));
 
-  validLabels = trainDataset.rows(WrapIndex(startPredictionFeatures,
+  validLabels = validDataset.rows(WrapIndex(startPredictionFeatures,
       validDataset.n_rows), WrapIndex(endPredictionFeatures,
       validDataset.n_rows));
 
+  if (useScaler)
+  {
+    scaler.Fit(trainFeatures);
+    scaler.Transform(trainFeatures, trainFeatures);
+    scaler.Transform(validFeatures, validFeatures);
+  }
   // TODO : Add support for augmentation here.
   mlpack::Log::Info << "Training Dataset Loaded." << std::endl;
 }
@@ -145,4 +144,37 @@ template<
   }
 }
 
+template<
+  typename DatasetX,
+  typename DatasetY,
+  class ScalerType
+> void DataLoader<
+    DatasetX, DatasetY, ScalerType
+>::LoadObjectDetectionDataset(const std::string& pathToAnnotations,
+                              const std::string& pathToImages,
+                              const bool absolutePath)
+{
+  std::vector<boost::filesystem::path> annotationsDirectory, imagesDirectory;
+
+  // Fill the directory vectors.
+  Utils::ListDir(pathToAnnotations, annotationsDirectory, absolutePath);
+  Utils::ListDir(pathToImages, imagesDirectory, absolutePath);
+
+  // Read each annotation file.
+  for (boost::filesystem::path annotationFile : annotationsDirectory)
+  {
+    // Parse the XML file into a property tree.
+    boost::property_tree::ptree annotation;
+    std::cout << annotationFile.string() << std::endl;
+    boost::property_tree::read_xml(annotationFile.string(), annotation);
+
+    // Read the properties inside the annotation file.
+    BOOST_FOREACH (boost::property_tree::ptree::value_type const& object,
+        annotation.get_child("annotation.object"))
+    {
+      std::cout << object.first << std::endl;
+    }
+  }
+}
+
 #endif
diff --git a/dataloader/datasets.hpp b/dataloader/datasets.hpp
index a0206b8f..57b8b2a3 100644
--- a/dataloader/datasets.hpp
+++ b/dataloader/datasets.hpp
@@ -26,15 +26,47 @@ template<
 >
 struct DatasetDetails
 {
+  //! Locally stored name of the dataset, used for identification during the
+  //! dataloader call.
   std::string datasetName;
-  std::string trainDownloadUrl;
-  std::string testDownloadUrl;
+
+  //! Locally stored URL for downloading training data.
+  std::string trainDownloadURL;
+
+  //! Locally stored URL for downloading testing data.
+  std::string testDownloadURL;
+
+  //! CRC-32 checksum for the training data file.
   std::string trainHash;
+
+  //! CRC-32 checksum for the testing data file.
   std::string testHash;
+
+  //! Locally stored boolean to determine if the dataset is of CSV or a
+  //! similar format.
   bool loadCSV;
+
+  //! Locally stored path to the file / directory for training data.
   std::string trainPath;
+
+  //! Locally stored path to the file / directory for testing data.
   std::string testPath;
 
+  //! Locally stored boolean to determine whether the dataset is in zip
+  //! format.
+  bool zipFile;
+
+  //! Locally stored URL for downloading the dataset.
+  std::string datasetURL;
+
+  //! Locally stored CRC-32 checksum for the dataset.
+  std::string datasetHash;
+
+  //! Locally stored path for saving the archived / zip dataset.
+  std::string datasetPath;
+
+  //! Locally stored server name for the download file.
+  std::string serverName;
+
   // Pre-Process functor.
   std::function PreProcess;
@@ -61,13 +93,18 @@
   // Default constructor.
   DatasetDetails() :
       datasetName(""),
-      trainDownloadUrl(""),
-      testDownloadUrl(""),
+      trainDownloadURL(""),
+      testDownloadURL(""),
      trainHash(""),
      testHash(""),
      loadCSV(false),
      trainPath(""),
      testPath(""),
+      zipFile(false),
+      datasetURL(""),
+      datasetPath(""),
+      datasetHash(""),
+      serverName("www.mlpack.org"),
      startTrainingInputFeatures(0),
      endTrainingInputFeatures(0),
      startTrainingPredictionFeatures(0),
@@ -77,23 +114,85 @@
       dropHeader(false)
   {/* Nothing to do here. */}
 
-  // Constructor for initializing object.
+  /**
+   * Constructor for initializing an object with separate
+   * train and test download URLs.
+   *
+   * @param datasetName Name of dataset used for identification during
+   *     dataloader call.
+   * @param trainDownloadURL URL for downloading training data.
+   * @param testDownloadURL URL for downloading testing data.
+   * @param trainHash CRC-32 checksum for training data.
+   * @param testHash CRC-32 checksum for testing data.
+   * @param loadCSV Determines if the format of the dataset is similar to CSV.
+   * @param trainPath Path for the training dataset.
+   * @param testPath Path for the testing dataset.
+   */
   DatasetDetails(const std::string& datasetName,
-                 const std::string& trainDownloadUrl,
-                 const std::string& testDownloadUrl,
+                 const std::string& trainDownloadURL,
+                 const std::string& testDownloadURL,
                  const std::string& trainHash,
                  const std::string& testHash,
                  const bool loadCSV,
                  const std::string& trainPath,
                  const std::string& testPath) :
                  datasetName(datasetName),
-                 trainDownloadUrl(trainDownloadUrl),
-                 testDownloadUrl(testDownloadUrl),
+                 trainDownloadURL(trainDownloadURL),
+                 testDownloadURL(testDownloadURL),
                  trainHash(trainHash),
                  testHash(testHash),
                  loadCSV(loadCSV),
                  trainPath(trainPath),
                  testPath(testPath),
+                 zipFile(false),
+                 datasetURL(""),
+                 datasetHash(""),
+                 serverName("www.mlpack.org"),
+                 startTrainingInputFeatures(0),
+                 endTrainingInputFeatures(0),
+                 startTrainingPredictionFeatures(0),
+                 endTrainingPredictionFeatures(0),
+                 startTestingInputFeatures(0),
+                 endTestingInputFeatures(0),
+                 dropHeader(false)
+  {
+    // Nothing to do here.
+  }
+
+  /**
+   * Constructor for initializing paths for zip files.
+   *
+   * @param datasetName Name of dataset used for identification during
+   *     dataloader call.
+   * @param zipFile Boolean to determine if dataset is stored in zip format.
+   * @param datasetURL URL for downloading dataset.
+   * @param datasetPath Path where the dataset will be downloaded.
+   * @param datasetHash CRC-32 checksum for dataset.
+   * @param loadCSV Determines if the format of dataset is similar to CSV.
+   * @param trainPath Path for training dataset.
+   * @param testPath Path for testing dataset.
+   */
+  DatasetDetails(const std::string& datasetName,
+                 const bool zipFile,
+                 const std::string& datasetURL,
+                 const std::string& datasetPath,
+                 const std::string& datasetHash,
+                 const bool loadCSV,
+                 const std::string& trainPath,
+                 const std::string& testPath) :
+                 datasetName(datasetName),
+                 zipFile(zipFile),
+                 datasetURL(datasetURL),
+                 datasetHash(datasetHash),
+                 datasetPath(datasetPath),
+                 loadCSV(loadCSV),
+                 trainPath(trainPath),
+                 testPath(testPath),
+                 trainDownloadURL(""),
+                 testDownloadURL(""),
+                 trainHash(""),
+                 testHash(""),
+                 serverName("www.mlpack.org"),
                  startTrainingInputFeatures(0),
                  endTrainingInputFeatures(0),
                  startTrainingPredictionFeatures(0),
@@ -119,14 +218,15 @@ template<
 class Datasets
 {
  public:
+  //! Get details of MNIST Dataset.
   const static DatasetDetails MNIST()
   {
     DatasetDetails mnistDetails(
         "mnist",
-        "/datasets/mnist_train.csv",
-        "/datasets/mnist_test.csv",
-        "772495e3",
-        "8bcdb7e1",
+        true,
+        "/datasets/mnist.tar.gz",
+        "./../data/mnist.tar.gz",
+        "9fa4efe5",
         true,
         "./../data/mnist_train.csv",
         "./../data/mnist_test.csv");
diff --git a/tests/dataloader_tests.cpp b/tests/dataloader_tests.cpp
index e435a61e..86c06630 100644
--- a/tests/dataloader_tests.cpp
+++ b/tests/dataloader_tests.cpp
@@ -50,6 +50,8 @@ BOOST_AUTO_TEST_CASE(CSVDataLoaderTest)
   // Check for training dataset using tuples.
   BOOST_REQUIRE_EQUAL(std::get<0>(irisDataloader.TrainSet()).n_cols, 75);
   BOOST_REQUIRE_EQUAL(std::get<0>(irisDataloader.TrainSet()).n_rows, 4);
+
+  Utils::RemoveFile("./../data/iris.csv");
 }
 
 /**
@@ -57,12 +59,23 @@ BOOST_AUTO_TEST_CASE(CSVDataLoaderTest)
  */
 BOOST_AUTO_TEST_CASE(MNISTDataLoaderTest)
 {
+  /**
   DataLoader<> dataloader("mnist", true, 0.80);
 
   // Check for correct dimensions.
   BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_cols, 784);
   BOOST_REQUIRE_EQUAL(dataloader.TestFeatures().n_cols, 784);
   BOOST_REQUIRE_EQUAL(dataloader.ValidFeatures().n_cols, 784);
   BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_rows, 33600);
+  */
+}
+
+/**
+ * Simple test for the object detection dataloader.
+ */
+BOOST_AUTO_TEST_CASE(ObjectDetectionDataLoader)
+{
+  DataLoader<> dataloader;
+  dataloader.LoadObjectDetectionDataset("./../data/annotations/", "./../data");
 }
 
 BOOST_AUTO_TEST_SUITE_END();
diff --git a/tests/utils_tests.cpp b/tests/utils_tests.cpp
index 90b0131c..aa04fec7 100644
--- a/tests/utils_tests.cpp
+++ b/tests/utils_tests.cpp
@@ -74,4 +74,17 @@ BOOST_AUTO_TEST_CASE(RemoveFileTest)
   BOOST_REQUIRE_EQUAL(Utils::PathExists("./../data/file.txt"), 0);
 }
 
+BOOST_AUTO_TEST_CASE(ExtractFilesTest)
+{
+  Utils::DownloadFile("/datasets/mnist.tar.gz", "./../data/mnist.tar.gz", "",
+      false, true, "www.mlpack.org", true, "./../data/");
+
+  BOOST_REQUIRE(Utils::PathExists("./../data/mnist_all.csv"));
+  BOOST_REQUIRE(Utils::PathExists("./../data/mnist.tar.gz"));
+
+  // Clean up.
+  Utils::RemoveFile("./../data/mnist_all.csv");
+  Utils::RemoveFile("./../data/mnist_all_centroids.csv");
+}
+
 BOOST_AUTO_TEST_SUITE_END();
diff --git a/utils/utils.hpp b/utils/utils.hpp
index 903aa201..6709d7dc 100644
--- a/utils/utils.hpp
+++ b/utils/utils.hpp
@@ -43,6 +43,47 @@ class Utils
     return (stat(filePath.c_str(), &buffer) == 0);
   }
 
+  /**
+   * Unzips any supported tar file.
+   *
+   * @param pathToArchive Path to where the tar file is stored.
+   * @param pathForExtraction Path where files will be extracted.
+   * @param absolutePath Boolean to determine if path is absolute or relative.
+   */
+  static int ExtractFiles(const std::string pathToArchive,
+                          const std::string pathForExtraction,
+                          const bool absolutePath = false)
+  {
+    std::string command = "tar -xvzf ";
+    if (!absolutePath)
+    {
+      #ifdef _WIN32
+      std::string pathToArchiveTemp(pathToArchive);
+      std::string pathForExtractionTemp(pathForExtraction);
+      std::replace(pathToArchiveTemp.begin(), pathToArchiveTemp.end(), '/',
+          '\\');
+      std::replace(pathForExtractionTemp.begin(), pathForExtractionTemp.end(),
+          '/', '\\');
+
+      command = "tar --force-local -xvzf " +
+          boost::filesystem::current_path().string() + "\\" +
+          pathToArchiveTemp;
+      #else
+      command = command + boost::filesystem::current_path().string() + "/" +
+          pathToArchive + " -C " + boost::filesystem::current_path().string() +
+          "/" + pathForExtraction;
+      #endif
+    }
+    else
+    {
+      command = command + pathToArchive + " -C " + pathForExtraction;
+    }
+
+    // Run the command using a system call.
+    std::system(command.c_str());
+    return 0;
+  }
+
   /**
    * Downloads files using boost asio.
    *
@@ -55,6 +96,8 @@ class Utils
    * @param absolutePath Boolean to determine if path is absolute or relative.
    * @param silent Boolean to display details of file being downloaded.
    * @param serverName Server to connect to, for downloading.
+   * @param zipFile Determines if the dataset needs to be extracted or not.
+   * @param pathForExtraction Path where files will be extracted if zipFile is true.
    * @returns 0 to determine success.
    */
   static int DownloadFile(const std::string url,
@@ -63,7 +106,9 @@ class Utils
                           const bool absolutePath = false,
                           const bool silent = true,
                           const std::string serverName =
-                              "www.mlpack.org")
+                              "www.mlpack.org",
+                          const bool zipFile = false,
+                          const std::string pathForExtraction = "./../data/")
   {
     // IO functionality by boost core.
     boost::asio::io_service ioService;
@@ -151,6 +196,13 @@ class Utils
     }
 
     outputFile.close();
+
+    // Extract Files.
+    if (zipFile)
+    {
+      Utils::ExtractFiles(downloadPath, pathForExtraction);
+    }
+
     return 0;
   }
@@ -213,5 +265,36 @@ class Utils
     return 0;
   }
 
+  /**
+   * Fills a vector with paths to all files in a directory.
+   *
+   * @param path Path to the directory.
+   * @param pathVector A vector of type boost::filesystem::path, which will be
+   *     filled with paths for all files / folders in the given directory.
+   * @param absolutePath Boolean to determine if path is absolute or relative.
+   */
+  static void ListDir(const std::string& path,
+                      std::vector<boost::filesystem::path>& pathVector,
+                      const bool absolutePath = false)
+  {
+    if (Utils::PathExists(path, absolutePath))
+    {
+      boost::filesystem::path directoryPath(path);
+
+      // Fill the path vector with the respective paths.
+      std::copy(boost::filesystem::directory_iterator(directoryPath),
+          boost::filesystem::directory_iterator(),
+          std::back_inserter(pathVector));
+
+      // Sort the path vector.
+      std::sort(pathVector.begin(), pathVector.end());
+    }
+    else
+    {
+      mlpack::Log::Warn << "The path " << path << " doesn't exist." <<
+          std::endl;
+    }
+  }
 };
+
 #endif
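
Usage sketch for the interfaces added by this patch (illustrative only, not part of the diff). The include paths and the main() wrapper are assumptions about the surrounding project layout; the dataset URL, server name, and local paths simply mirror the values used in the tests above.

// Sketch: download an archived dataset, extract it, and load a PASCAL VOC
// style object detection dataset with the API introduced in this patch.
#include <dataloader/dataloader.hpp>  // Assumed include path.
#include <utils/utils.hpp>            // Assumed include path.

int main()
{
  // Download and extract an archived dataset. The arguments follow the
  // DownloadFile() signature in utils.hpp: url, downloadPath, name,
  // absolutePath, silent, serverName, zipFile, pathForExtraction.
  Utils::DownloadFile("/datasets/mnist.tar.gz", "./../data/mnist.tar.gz", "",
      false, false, "www.mlpack.org", true, "./../data/");

  // Load an object detection dataset: one XML annotation file per image,
  // each wrapped in an annotation tag with object / bndbox children.
  DataLoader<> dataloader;
  dataloader.LoadObjectDetectionDataset("./../data/annotations/",
      "./../data/");

  return 0;
}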