diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..ed5e6cc42 --- /dev/null +++ b/.clang-format @@ -0,0 +1,3 @@ +BasedOnStyle: LLVM +IndentWidth: 4 +ColumnLimit: 120 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..ab68da3aa --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,12 @@ +# These commits are ignored by `git blame`. +# https://git-scm.com/docs/git-blame +# Run this command to configure git to use this file. +# `$ git config blame.ignoreRevsFile .git-blame-ignore-revs` + +# clang-format src/ +cb999f20b6f2934ad7c94b10d2b02f6acf74aab4 +b8d91db545fba0f2e85070dc438d2447528b619e + +# clang-format test/ +9a1b93e4fea27e91d14c18821be2c940adf63bf4 +68f7925044ba8777f6a7f41bf5704915de13f608 diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml new file mode 100644 index 000000000..35ddc05b0 --- /dev/null +++ b/.github/workflows/clang-format-check.yml @@ -0,0 +1,20 @@ +name: check-clang-format + +on: + push: + branches: + - main + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - uses: DoozyX/clang-format-lint-action@v0.18.1 + with: + source: 'src test' + exclude: './third_party ./external' + extensions: 'h,cpp' + clangFormatVersion: 18.1.3 diff --git a/CMakeLists.txt b/CMakeLists.txt index e25b7642e..fb57026ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -cmake_minimum_required(VERSION 3.20) +cmake_minimum_required(VERSION 3.25.2) # Build release version by default (override with -DCMAKE_BUILD_TYPE=Debug in your initial cmake invocation) # This needs to be set *before* the project() command @@ -32,6 +32,7 @@ set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON) set(CMAKE_CXX_STANDARD 20 CACHE STRING "C++ standard to conform to") set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_OPTIMIZE_DEPENDENCIES 1) set(CMAKE_CXX_FLAGS_DEBUG="${CMAKE_CXX_FLAGS_DEBUG} -g") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") @@ -112,12 +113,17 @@ if(USE_PAPI) endif() ########## +find_package(fmt) +add_definitions(-DSPDLOG_FMT_EXTERNAL) + option(USE_CUDA "Whether to activate compilation of CUDA features" OFF) include(CheckLanguage) check_language(CUDA) if(USE_CUDA AND CMAKE_CUDA_COMPILER) enable_language(CUDA) find_package(CUDAToolkit REQUIRED) + set(CMAKE_CUDA_STANDARD 20) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) if(${CMAKE_COMPILER_IS_GNUCXX}) set(GCC_EXPECTED_VERSION 11.3.0) @@ -137,7 +143,7 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER) add_definitions(-DUSE_CUDA) message(STATUS "Note: disabled CUSPARSE_DEPRECATED in main CMakeLists.txt") add_definitions(-DDISABLE_CUSPARSE_DEPRECATED) - set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD 20) set(CMAKE_CUDA_STANDARD_REQUIRED ON) message(STATUS "CUDA enabled (version ${CMAKE_CUDA_COMPILER_VERSION})") if(DEFINED ENV{CUDAHOSTCXX}) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 56ba82fe0..5e27ad91b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -79,9 +79,13 @@ That is, please try your best to make a good-quality contribution and we will he Please choose an expressive title and provide a short description of your changes. Feel free to mark your pull request "WIP: " or "Draft: " in the title. Note that you can add more commits to your pull request after you created it. -7. 
You **receive feedback** on your proposed contribution. + Ideally, the PR contains only the changes you made for it, + e.g., by rebasing your branch on top of the target branch. This makes it easier for others to + review your PR. +7. [Resolve any open conflicts](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/addressing-merge-conflicts/about-merge-conflicts) with the target branch of the PR. +8. You **receive feedback** on your proposed contribution. You may be asked to apply certain changes, or we might apply straightforward adjustments ourselves before the integration. -8. If it looks good (potentially after some help), **your contribution becomes a part of DAPHNE**. +9. If it looks good (potentially after some help), **your contribution becomes a part of DAPHNE**. ### Experienced DAPHNE Contributors (Collaborators) @@ -108,3 +112,18 @@ At the same time, this freedom comes with certain responsibilities, which are ro - actually merging a pull request in Balancing the handling of pull requests is important to *keep the development process scalable*. + + +### Code Style + +Before contributing, please make sure to run `clang-format` on your C++ (.h and +.cpp) files. The codebase is currently formatted with `clang-format` version +`18.1.3`. This is the default `clang-format` version when installing via `apt` +on Ubuntu 24.04, and it can easily be installed via `python -m pip install clang-format==18.1.3` +on other systems. +We provide a `.clang-format` file at the root of the repository. Most text +editors and IDEs have some kind of integration for detecting that file +and automatically applying `clang-format`. `git-clang-format` can be used to +format staged files. +For more information about `clang-format`, `git-clang-format`, and text editor +integration, please see [ClangFormat](https://clang.llvm.org/docs/ClangFormat.html). diff --git a/build.sh b/build.sh index 7b0813dcf..5bdf6203f 100755 --- a/build.sh +++ b/build.sh @@ -60,6 +60,7 @@ function printHelp { echo " --fpgaopencl Compile with support for Intel PAC D5005 FPGA" echo " --mpi Compile with support for MPI" echo " --hdfs Compile with support for HDFS" + echo " --io-uring Compile with support for io_uring" echo " --no-papi Compile without support for PAPI" } @@ -451,6 +452,7 @@ BUILD_FPGAOPENCL="-DUSE_FPGAOPENCL=OFF" BUILD_DEBUG="-DCMAKE_BUILD_TYPE=Release" BUILD_MPI="-DUSE_MPI=OFF" BUILD_HDFS="-DUSE_HDFS=OFF" +BUILD_IO_URING="-DUSE_IO_URING=OFF" BUILD_PAPI="-DUSE_PAPI=ON" WITH_DEPS=1 WITH_SUBMODULE_UPDATE=1 @@ -504,6 +506,10 @@ while [[ $# -gt 0 ]]; do echo using HDFS export BUILD_HDFS="-DUSE_HDFS=ON" ;; + --io-uring) + echo using io_uring + export BUILD_IO_URING="-DUSE_IO_URING=ON" + ;; --no-papi) echo not using PAPI export BUILD_PAPI="-DUSE_PAPI=OFF" ;; @@ -655,7 +661,7 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then fi #------------------------------------------------------------------------------ - # #8.3 Antlr4 (parser) + # Antlr4 (parser) #------------------------------------------------------------------------------ antlrJarName="antlr-${antlrVersion}-complete.jar" @@ -708,7 +714,7 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then fi #------------------------------------------------------------------------------ - # #8.4 catch2 (unit test framework) + # catch2 (unit test framework) #------------------------------------------------------------------------------ # Download catch2 release zip (if necessary), and unpack the single header file # (if necessary).
@@ -734,7 +740,7 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then fi #------------------------------------------------------------------------------ - # #8.5 OpenBLAS (basic linear algebra subprograms) + # OpenBLAS (basic linear algebra subprograms) #------------------------------------------------------------------------------ openBlasDirName="OpenBLAS-$openBlasVersion" @@ -744,7 +750,7 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then if ! is_dependency_downloaded "${dep_openBlas[@]}"; then daphne_msg "Get OpenBlas version ${openBlasVersion}" - wget "https://github.com/xianyi/OpenBLAS/releases/download/v${openBlasVersion}/${openBlasZipName}" \ + wget "https://github.com/OpenMathLib/OpenBLAS/releases/download/v${openBlasVersion}/${openBlasZipName}" \ -qO "${cacheDir}/${openBlasZipName}" unzip -q "$cacheDir/$openBlasZipName" -d "$sourcePrefix" dependency_download_success "${dep_openBlas[@]}" @@ -761,7 +767,7 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then fi #------------------------------------------------------------------------------ - # #8.6 nlohmann/json (library for JSON parsing) + # nlohmann/json (library for JSON parsing) #------------------------------------------------------------------------------ nlohmannjsonDirName=nlohmannjson @@ -779,7 +785,7 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then fi #------------------------------------------------------------------------------ - # #8.7 abseil (compiled separately to apply a patch) + # abseil (compiled separately to apply a patch) #------------------------------------------------------------------------------ abslPath=$sourcePrefix/abseil-cpp @@ -808,7 +814,7 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then fi #------------------------------------------------------------------------------ - # #8.8 MPI (Default is MPI library is OpenMPI but cut can be any) + # MPI (the default MPI library is OpenMPI, but it can be any) #------------------------------------------------------------------------------ MPIZipName=openmpi-$openMPIVersion.tar.gz @@ -834,7 +840,7 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then fi #------------------------------------------------------------------------------ - # #8.9 gRPC + # gRPC #------------------------------------------------------------------------------ grpcDirName="grpc" @@ -875,7 +881,7 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then fi #------------------------------------------------------------------------------ - # #8.10 Arrow / Parquet + # Arrow / Parquet #------------------------------------------------------------------------------ arrowDirName="apache-arrow-$arrowVersion" @@ -913,22 +919,44 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then fi #------------------------------------------------------------------------------ - # 8.11 spdlog + # fmt + #------------------------------------------------------------------------------ + + fmtDirName="fmt-$fmtVersion" + fmtArtifactFileName=$fmtDirName.zip + if ! is_dependency_downloaded "fmt_v${fmtVersion}"; then + rm -rf "${sourcePrefix:?}/${fmtDirName}" + wget "https://github.com/fmtlib/fmt/releases/download/${fmtVersion}/$fmtArtifactFileName" -qO "$cacheDir/$fmtArtifactFileName" + unzip -q "$cacheDir/$fmtArtifactFileName" -d "$sourcePrefix" + dependency_download_success "fmt_v${fmtVersion}" + fi + if !
is_dependency_installed "fmt_v${fmtVersion}"; then + cmake -G Ninja -S "${sourcePrefix}/${fmtDirName}" -B "${buildPrefix}/${fmtDirName}" \ + -DCMAKE_INSTALL_PREFIX="${installPrefix}" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DFMT_MASTER_PROJECT=OFF + cmake --build "${buildPrefix}/${fmtDirName}" --target install/strip + dependency_install_success "fmt_v${fmtVersion}" + else + daphne_msg "No need to build fmt again." + fi + + #------------------------------------------------------------------------------ + # spdlog #------------------------------------------------------------------------------ spdlogDirName="spdlog-$spdlogVersion" spdlogArtifactFileName=$spdlogDirName.tar.gz if ! is_dependency_downloaded "spdlog_v${spdlogVersion}"; then rm -rf "${sourcePrefix:?}/${spdlogDirName}" - wget "https://github.com/gabime/spdlog/archive/refs/tags/v$spdlogVersion.tar.gz" -qO \ + # changed URL scheme due to temporarily use tip of main branch (2024-10-03) +# wget "https://github.com/gabime/spdlog/archive/refs/tags/v$spdlogVersion.tar.gz" -qO \ + wget https://github.com/gabime/spdlog/archive/$spdlogVersion.tar.gz -qO \ "$cacheDir/$spdlogArtifactFileName" tar xzf "$cacheDir/$spdlogArtifactFileName" --directory="$sourcePrefix" dependency_download_success "spdlog_v${spdlogVersion}" fi - if ! is_dependency_installed "spdlog_v${spdlogVersion}"; then cmake -G Ninja -S "${sourcePrefix}/${spdlogDirName}" -B "${buildPrefix}/${spdlogDirName}" \ - -DCMAKE_INSTALL_PREFIX="${installPrefix}" -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DSPDLOG_FMT_EXTERNAL=ON -DCMAKE_INSTALL_PREFIX="${installPrefix}" -DCMAKE_POSITION_INDEPENDENT_CODE=ON cmake --build "${buildPrefix}/${spdlogDirName}" --target install/strip dependency_install_success "spdlog_v${spdlogVersion}" else @@ -936,7 +964,7 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then fi #------------------------------------------------------------------------------ - # 8.12 Eigen + # Eigen #------------------------------------------------------------------------------ eigenDirName="eigen-${eigenVersion}" @@ -957,7 +985,38 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then fi #------------------------------------------------------------------------------ - # #8.13 Build MLIR + # HAWQ (libhdfs3) + #------------------------------------------------------------------------------ + + hawqDirName="hawq-rel-v$hawqVersion" + hawqDlTarName="v${hawqVersion}.tar.gz" + hawqTarName="${hawqDirName}.tar.gz" + hawqInstDirName=$installPrefix + + if [ $BUILD_HDFS == "-DUSE_HDFS=ON" ]; then + if ! is_dependency_downloaded "hawq_v${hawqVersion}"; then + daphne_msg "Get HAWQ (libhdfs3) version ${hawqVersion}" + wget "https://github.com/apache/hawq/archive/refs/tags/rel/${hawqDlTarName}" \ + -qO "${cacheDir}/${hawqTarName}" + tar -xf "$cacheDir/$hawqTarName" -C "$sourcePrefix" + daphne_msg "Applying 0005-libhdfs3-remove-gtest-dep.patch" + patch -Np1 -i "${patchDir}/0005-libhdfs3-remove-gtest-dep.patch" -d "$sourcePrefix/$hawqDirName" + daphne_msg "Applying 0006-libhdfs3-add-cstdint-include.patch" + patch -Np1 -i "${patchDir}/0006-libhdfs3-add-cstdint-include.patch" -d "$sourcePrefix/$hawqDirName" + dependency_download_success "hawq_v${hawqVersion}" + fi + if ! 
is_dependency_installed "hawq_v${hawqVersion}"; then + cmake -G Ninja -S "$sourcePrefix/$hawqDirName/depends/libhdfs3" -B "${buildPrefix}/${hawqDirName}" \ + -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="$installPrefix" + cmake --build "${buildPrefix}/${hawqDirName}" --target install/strip + dependency_install_success "hawq_v${hawqVersion}" + else + daphne_msg "No need to build HAWQ (libhdfs3) again." + fi + fi + + #------------------------------------------------------------------------------ + # Build MLIR #------------------------------------------------------------------------------ # We rarely need to build MLIR/LLVM, only during the first build of the # prototype and after upgrades of the LLVM sub-module. To avoid unnecessary @@ -1016,34 +1075,7 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then fi #------------------------------------------------------------------------------ - # #8.14 HAWQ (libhdfs3) - #------------------------------------------------------------------------------ - - hawqDirName="hawq-rel-v$hawqVersion" - hawqTarName="v${hawqVersion}.tar.gz" - hawqInstDirName=$installPrefix - if ! is_dependency_downloaded "hawq_v${hawqVersion}"; then - daphne_msg "Get HAWQ (libhdfs3) version ${hawqVersion}" - wget "https://github.com/apache/hawq/archive/refs/tags/rel/${hawqTarName}" \ - -qO "${cacheDir}/${hawqTarName}" - tar -xf "$cacheDir/$hawqTarName" -C "$sourcePrefix" - daphne_msg "Applying 0005-libhdfs3-remove-gtest-dep.patch" - patch -Np1 -i "${patchDir}/0005-libhdfs3-remove-gtest-dep.patch" -d "$sourcePrefix/$hawqDirName" - daphne_msg "Applying 0006-libhdfs3-add-cstdint-include.patch" - patch -Np1 -i "${patchDir}/0006-libhdfs3-add-cstdint-include.patch" -d "$sourcePrefix/$hawqDirName" - dependency_download_success "hawq_v${hawqVersion}" - fi - if ! is_dependency_installed "hawq_v${hawqVersion}"; then - cmake -G Ninja -S "$sourcePrefix/$hawqDirName/depends/libhdfs3" -B "${buildPrefix}/${hawqDirName}" \ - -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="$installPrefix" - cmake --build "${buildPrefix}/${hawqDirName}" --target install/strip - dependency_install_success "hawq_v${hawqVersion}" - else - daphne_msg "No need to build HAWQ (libhdfs3) again." - fi - - #------------------------------------------------------------------------------ - # 8.15 Liburing + # Liburing #------------------------------------------------------------------------------ liburingDirName="liburing-$liburingVersion" @@ -1053,28 +1085,30 @@ if [ $BUILD_PAPI == "-DUSE_PAPI=ON" ]; then liburing_cc=$([ "$CC" = "" ] && echo "gcc" || echo "$CC") liburing_cxx=$([ "$CXX" = "" ] && echo "g++" || echo "$CXX") - if ! is_dependency_downloaded "liburing_v${liburingVersion}"; then - daphne_msg "Get liburing version ${liburingVersion}" - wget "https://github.com/axboe/liburing/archive/refs/tags/${liburingTarName}" \ - -qO "${cacheDir}/${liburingTarName}" - mkdir "$sourcePrefix/$liburingDirName" - tar -xf "$cacheDir/$liburingTarName" -C "$sourcePrefix/$liburingDirName" --strip-components=1 - dependency_download_success "liburing_v${liburingVersion}" - fi - if ! is_dependency_installed "liburing_v${liburingVersion}"; then - cd "$sourcePrefix/$liburingDirName" - ./configure --cc="$liburing_cc" --cxx="$liburing_cxx" --prefix="$liburingInstDirName" - make -j"$(nproc)" - cp ./src/liburing.a "$installPrefix/lib/" - cp -r ./src/include/* "$installPrefix/include" - cd - > /dev/null - dependency_install_success "liburing_v${liburingVersion}" - else - daphne_msg "No need to build liburing again." 
+ if [ $BUILD_IO_URING == "-DUSE_IO_URING=ON" ]; then + if ! is_dependency_downloaded "liburing_v${liburingVersion}"; then + daphne_msg "Get liburing version ${liburingVersion}" + wget "https://github.com/axboe/liburing/archive/refs/tags/${liburingTarName}" \ + -qO "${cacheDir}/${liburingTarName}" + mkdir "$sourcePrefix/$liburingDirName" + tar -xf "$cacheDir/$liburingTarName" -C "$sourcePrefix/$liburingDirName" --strip-components=1 + dependency_download_success "liburing_v${liburingVersion}" + fi + if ! is_dependency_installed "liburing_v${liburingVersion}"; then + cd "$sourcePrefix/$liburingDirName" + ./configure --cc="$liburing_cc" --cxx="$liburing_cxx" --prefix="$liburingInstDirName" + make -j"$(nproc)" + cp ./src/liburing.a "$installPrefix/lib/" + cp -r ./src/include/* "$installPrefix/include" + cd - > /dev/null + dependency_install_success "liburing_v${liburingVersion}" + else + daphne_msg "No need to build liburing again." + fi fi #------------------------------------------------------------------------------ - # 8.16 Fetch bitstreams + # Fetch bitstreams #------------------------------------------------------------------------------ if [[ $BUILD_FPGAOPENCL = *"ON"* ]]; then diff --git a/containers/build-containers.sh b/containers/build-containers.sh index 10e65020e..42d842c38 100755 --- a/containers/build-containers.sh +++ b/containers/build-containers.sh @@ -85,6 +85,7 @@ DAPHNE_TARGET=daphne-deps BASE_IMAGE=ubuntu:${ubuntuVersion} DAPHNE_TAG=$TIMESTAMP_DATE_${ARCH} IMAGE_REPO=daphneeu/$DAPHNE_TARGET +DAPHNE_BUILD_FLAGS="--hdfs --mpi" #bulid deps stage build_daphne -deps @@ -106,7 +107,6 @@ BASE_IMAGE=ubuntu:${ubuntuVersion} DAPHNE_TAG=${TIMESTAMP_DATE}_${ARCH}_BASE_ubuntu${ubuntuVersion} IMAGE_REPO=daphneeu/$DAPHNE_TARGET build_daphne -dev - $USE_SUDO docker tag $IMAGE_REPO:$DAPHNE_TAG daphneeu/daphne-dev:latest_${ARCH}_BASE #------------------------------------------------------------------------------ @@ -118,19 +118,8 @@ BASE_IMAGE=nvidia/cuda:$CUDA_TAG DAPHNE_TAG=${TIMESTAMP_DATE}_${ARCH}_CUDA_${CUDA_TAG} IMAGE_REPO=daphneeu/$DAPHNE_TARGET build_daphne -dev - $USE_SUDO docker tag $IMAGE_REPO:$DAPHNE_TAG daphneeu/daphne-dev:latest_${ARCH}_CUDA -#----------------------------------------------------------------------------- -# Images for DAPHNE development (OneAPI) -#------------------------------------------------------------------------------ -#DAPHNE_TARGET=daphne-dev -#ONEAPI_TAG=2023.1.0-devel-ubuntu${ubuntuVersion} -#BASE_IMAGE=intel/oneapi:$ONEAPI_TAG -#DAPHNE_TAG=${TIMESTAMP_DATE}_${ONEAPI_TAG} -#IMAGE_REPO=daphneeu/$DAPHNE_TARGET -#build_daphne -dev - #------------------------------------------------------------------------------ # Images for running DAPHNE #------------------------------------------------------------------------------ @@ -139,7 +128,7 @@ BASE_IMAGE=daphneeu/daphne-deps FINAL_BASE_IMAGE=ubuntu:${ubuntuVersion} DAPHNE_TAG=${TIMESTAMP_DATE}_${ARCH}_BASE_ubuntu${ubuntuVersion} IMAGE_REPO=daphneeu/$DAPHNE_TARGET -DAPHNE_BUILD_FLAGS="--mpi" +DAPHNE_BUILD_FLAGS="--hdfs --mpi" build_daphne $USE_SUDO docker tag $IMAGE_REPO:$DAPHNE_TAG daphneeu/daphne:latest_${ARCH}_BASE @@ -152,8 +141,19 @@ DAPHNE_TAG=${TIMESTAMP_DATE}_${ARCH}_CUDA_${CUDA_TAG} IMAGE_REPO=daphneeu/$DAPHNE_TARGET BASE_IMAGE=daphneeu/daphne-dev FINAL_BASE_IMAGE=nvidia/cuda:$CUDA_TAG -DAPHNE_BUILD_FLAGS="--mpi --cuda" +DAPHNE_BUILD_FLAGS="--hdfs --mpi --cuda" build_daphne $USE_SUDO docker tag $IMAGE_REPO:$DAPHNE_TAG daphneeu/daphne:latest_${ARCH}_CUDA 
+#----------------------------------------------------------------------------- +# Images for conversion to singularity for DAPHNE compilation +#------------------------------------------------------------------------------ +DAPHNE_TARGET=daphne-dev-hpc +CUDA_TAG=${cudaVersion}-cudnn-devel-ubuntu${ubuntuVersion} +BASE_IMAGE=nvidia/cuda:$CUDA_TAG +DAPHNE_TAG=${TIMESTAMP_DATE}_${ARCH}_CUDA_${CUDA_TAG} +IMAGE_REPO=daphneeu/$DAPHNE_TARGET +build_daphne -dev-hpc +$USE_SUDO docker tag $IMAGE_REPO:$DAPHNE_TAG daphneeu/daphne-dev:latest_${ARCH}_HPC + set +e diff --git a/containers/daphne-deps.Dockerfile b/containers/daphne-deps.Dockerfile index b5b93de55..9a9131068 100644 --- a/containers/daphne-deps.Dockerfile +++ b/containers/daphne-deps.Dockerfile @@ -62,9 +62,10 @@ FROM build-cmake AS build ARG DAPHNE_DIR=/daphne ARG DAPHNE_REPO=https://github.com/daphne-eu/daphne.git ARG DAPHNE_BRANCH=main +ARG DAPHNE_BUILD_FLAGS="--mpi --hdfs" RUN git clone --depth=1 --single-branch --branch=$DAPHNE_BRANCH $DAPHNE_REPO $DAPHNE_DIR WORKDIR $DAPHNE_DIR -RUN ./build.sh --no-fancy --no-submodule-update --installPrefix /usr/local +RUN PATH=/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH ./build.sh --no-fancy --no-submodule-update --installPrefix /usr/local $DAPHNE_BUILD_FLAGS RUN find /usr/local -exec file {} \; | grep -e "not stripped" | cut -d ":" -f 1 | xargs strip --strip-unneeded RUN rm -rf $DAPHNE_DIR RUN ldconfig diff --git a/containers/daphne-dev-hpc.Dockerfile b/containers/daphne-dev-hpc.Dockerfile new file mode 100644 index 000000000..5b4516d8e --- /dev/null +++ b/containers/daphne-dev-hpc.Dockerfile @@ -0,0 +1,51 @@ +# syntax=docker/dockerfile:1 + +# Copyright 2023 The DAPHNE Consortium +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# This Dockerfile provides a basic DAPHNE compilation environment with all +# third-party dependencies precompiled (use `./build.sh --no-deps --installPrefix /usr/local` to compile DAPHNE) + +ARG BASE_IMAGE=ubuntu:20.04 +#ARG FINAL_BASE_IMAGE=ubuntu:20.04 +ARG CMAKE_VERSION=3.29.3 +ARG TIMESTAMP=0 +ARG TZ=Etc/UTC + +FROM ${BASE_IMAGE} AS daphne-dev-hpc +ARG DEBIAN_FRONTEND="noninteractive" +ARG TZ +RUN apt-get -qq -y update && apt-get -y upgrade && apt-get -y --no-install-recommends install \ + ca-certificates file git openssh-client unzip wget tar \ + libomp-dev libpfm4-dev libssl-dev libxml2-dev uuid-dev zlib1g-dev libgsasl-dev libkrb5-dev \ + build-essential clang gfortran lld llvm llvm-18-tools ninja-build openjdk-11-jdk-headless pkg-config python3-numpy python3-pandas \ + vim nano rsync sudo iputils-ping virtualenv openssh-server iproute2 git htop gdb lldb lld gpg-agent net-tools \ + software-properties-common ca-certificates file unzip wget tar zstd \ + ccache python3-pip python3-networkx python3-dev graphviz-dev clang-format \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +COPY --from=daphneeu/daphne-deps /usr/local/bin/ /usr/local/bin/ +COPY --from=daphneeu/daphne-deps /usr/local/include/ /usr/local/include/ +COPY --from=daphneeu/daphne-deps /usr/local/lib/ /usr/local/lib/ +COPY --from=daphneeu/daphne-deps /usr/local/share/ /usr/local/share/ +RUN ldconfig +# this is a temporary workaround to make the lit code (from the llvm-*-tools package) available to some pre-Ubuntu24 \ +# test cases when run locally in the dev container +RUN ln -s /usr/lib/llvm-18 /usr/lib/llvm-10 +RUN ln -fs /usr/share/zoneinfo/$TZ /etc/localtime +#COPY entrypoint-interactive.sh / +#RUN mkdir -p /var/run/sshd +#EXPOSE 22 +#ENTRYPOINT [ "/entrypoint-interactive.sh"] diff --git a/containers/daphne-dev.Dockerfile b/containers/daphne-dev.Dockerfile index d56fb8002..c50a7c375 100644 --- a/containers/daphne-dev.Dockerfile +++ b/containers/daphne-dev.Dockerfile @@ -33,8 +33,9 @@ RUN apt-get -qq -y update && apt-get -y upgrade && apt-get -y --no-install-recom build-essential clang gfortran lld llvm llvm-18-tools ninja-build openjdk-11-jdk-headless pkg-config python3-numpy python3-pandas \ vim nano rsync sudo iputils-ping virtualenv openssh-server iproute2 git htop gdb lldb lld gpg-agent net-tools \ software-properties-common ca-certificates file unzip wget tar zstd \ - ccache python3-pip python3-networkx python3-dev graphviz-dev \ + ccache python3-pip python3-networkx python3-dev graphviz-dev clang-format \ && apt-get clean && rm -rf /var/lib/apt/lists/* + COPY --from=daphneeu/daphne-deps /usr/local/bin/ /usr/local/bin/ COPY --from=daphneeu/daphne-deps /usr/local/include/ /usr/local/include/ COPY --from=daphneeu/daphne-deps /usr/local/lib/ /usr/local/lib/ diff --git a/containers/publish.sh b/containers/publish.sh index 0de241a25..cd6d32e06 100755 --- a/containers/publish.sh +++ b/containers/publish.sh @@ -44,8 +44,8 @@ fi $USE_SUDO docker push -a daphneeu/github-action # cuda dev image -$USE_SUDO docker tag daphneeu/daphne-dev:${TIMESTAMP_DATE}_${ARCH}_CUDA_${cudaVersion}-cudnn8-devel-ubuntu${ubuntuVersion} daphneeu/daphne-dev:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn8-devel-ubuntu${ubuntuVersion} -$USE_SUDO docker push daphneeu/daphne-dev:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn8-devel-ubuntu${ubuntuVersion} +$USE_SUDO docker tag daphneeu/daphne-dev:${TIMESTAMP_DATE}_${ARCH}_CUDA_${cudaVersion}-cudnn-devel-ubuntu${ubuntuVersion}
daphneeu/daphne-dev:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn-devel-ubuntu${ubuntuVersion} +$USE_SUDO docker push daphneeu/daphne-dev:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn-devel-ubuntu${ubuntuVersion} $USE_SUDO docker push daphneeu/daphne-dev:latest_${ARCH}_CUDA # base dev image @@ -54,8 +54,8 @@ $USE_SUDO docker push daphneeu/daphne-dev:${VERSION}_${ARCH}_BASE_ubuntu${ubuntu $USE_SUDO docker push daphneeu/daphne-dev:latest_${ARCH}_BASE # cuda run image -$USE_SUDO docker tag daphneeu/daphne:${TIMESTAMP_DATE}_${ARCH}_CUDA_${cudaVersion}-cudnn8-runtime-ubuntu${ubuntuVersion} daphneeu/daphne:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn8-runtime-ubuntu${ubuntuVersion} -$USE_SUDO docker push daphneeu/daphne:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn8-runtime-ubuntu${ubuntuVersion} +$USE_SUDO docker tag daphneeu/daphne:${TIMESTAMP_DATE}_${ARCH}_CUDA_${cudaVersion}-cudnn-runtime-ubuntu${ubuntuVersion} daphneeu/daphne:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn-runtime-ubuntu${ubuntuVersion} +$USE_SUDO docker push daphneeu/daphne:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn-runtime-ubuntu${ubuntuVersion} $USE_SUDO docker push daphneeu/daphne:latest_${ARCH}_CUDA # base run image diff --git a/doc/GettingStarted.md b/doc/GettingStarted.md index 07e69abbd..dd9eebd41 100644 --- a/doc/GettingStarted.md +++ b/doc/GettingStarted.md @@ -233,6 +233,7 @@ launching DAPHNE via Docker (see below) should work the same way as in a native | java (e.g. openjdk) | 11 (1.7 should be fine) | | | jq | | json commandline processor used in docker image generation scripts. | | libpfm4-dev | 4.10 | This dependency is needed for profiling support [DAPHNE-#479] | +| gRPC | 1.38.0 | | | libssl-dev | 1.1.1 | Dependency introduced while optimizing grpc build (which used to build ssl unnecessarily) | | lld | 10.0.0 | | | llvm-10-tools | 10, 15, 18 | `apt` provides up to `llvm-10-tools` for Ubuntu 20.04 whereas 22.04 / 24.04 require a newer version such as `llvm-15-tools`. | diff --git a/doc/HDFS-Usage.md b/doc/HDFS-Usage.md new file mode 100644 index 000000000..429e40bfe --- /dev/null +++ b/doc/HDFS-Usage.md @@ -0,0 +1,233 @@ + +# HDFS Usage + +About employing HDFS as a distributed file system. + +This document shows how a DAPHNE user can execute DAPHNE scripts using HDFS as a file system, +which is optimized for performance on big data through distributed computing. +This document assumes that DAPHNE was built with the `--hdfs` option; if that is not the case, please rebuild DAPHNE via +`./build.sh --hdfs` + +The DAPHNE build script uses [HAWQ (libhdfs3)](https://github.com/apache/hawq/archive/refs/tags/rel/v3.0.0.0.tar.gz). + +## Configuring DAPHNE for HDFS + +In order for DAPHNE to utilize the HDFS file system, certain command-line arguments need to be passed +(or included in the config file). + +- `--enable-hdfs`: A flag to enable HDFS. +- `--hdfs-ip=`: The IP and port HDFS listens to. +- `--hdfs-username=`: The username used to connect to HDFS. + +## Reading from HDFS + +In order to read a file from HDFS, some preprocessing must be done. Assuming the +file is named `FILE_NAME`, a user needs to: + +1. Upload the file into HDFS. DAPHNE expects the file to be located inside a directory with some specific naming conventions.
The path can be any path under HDFS; however, the file must be named with the following convention: + +``` +/path/to/hdfs/file/FILE_NAME.FILE_TYPE/FILE_NAME.FILE_TYPE_segment_1 +``` + +`FILE_TYPE` is either `.csv` or `.dbdf` (DAPHNE binary data format) followed by `.hdfs`, e.g. `myfile.csv.hdfs`. + +The suffix `_segment_1` is necessary since we support multiple writers at once (see more below); the writers need to write into different files (different segments). +In the case where the user pre-uploads the file, it needs to be in the same format, but with just one segment. + +Each segment must also have its own .meta file within HDFS. This is a JSON +file containing information about the size of the segment as well as the value type. +For example, `myfile.csv.hdfs_segment_1.meta`: + +```json +{ + "numCols": 10, + "numRows": 10, + "valueType": "f64" +} +``` + +2. We also need to create a .meta file for the file within the local file system (from where DAPHNE is invoked). + As for any other file read by DAPHNE, this is a JSON file containing information about where the + file is, its rows/cols, etc. The file should be named `FILE_NAME.FILE_TYPE.meta`, e.g., + `myfile.csv.hdfs.meta`. The meta file contains all the regular information any DAPHNE meta file contains, but in addition it also states whether this is an HDFS file and where it is located within HDFS: + +```json +{ + "hdfs": { + "HDFSFilename": "/path/to/hdfs/file/FILE_NAME.FILE_TYPE", + "isHDFS": true + }, + "numCols": 10, + "numRows": 10, + "valueType": "f64" +} +``` + +### Example + +Let's say we have a dataset called `training_data.csv` which we want to upload to HDFS and use with DAPHNE. + +1. Upload the file under the path `/datasets` and create the segment .meta file. HDFS should look like this: + +```bash +$ hdfs dfs -ls / +/datasets/training_data.csv.hdfs/training_data.csv.hdfs_segment_1 +/datasets/training_data.csv.hdfs/training_data.csv.hdfs_segment_1.meta + +$ hdfs dfs -cat /datasets/training_data.csv.hdfs/training_data.csv.hdfs_segment_1.meta +{"numCols":10,"numRows":10,"valueType":"f64"} +``` + +2. Create the local `.meta` file: + +```bash +$ cat ./training_data.csv.hdfs.meta +{"hdfs":{"HDFSFilename":"/datasets/training_data.csv.hdfs","isHDFS":true},"numCols":10,"numRows":10,"valueType":"f64"} +``` + +3. DAPHNE script: + +``` +X = readMatrix("training_data.csv.hdfs"); +print(X); +``` + +4. Run DAPHNE + +``` +./bin/daphne --enable-hdfs --hdfs-ip= --hdfs-username=ubuntu code.daph +``` + +## Writing to HDFS + +In order to write to HDFS, we just need to use the `writeMatrix` function like we would for any other file type and specify the `.hdfs` suffix. For example: + +1. Code + +``` +X = rand(10, 10, 0.0, 1.0, 1.0, 1); +writeMatrix(X, "randomSet.csv.hdfs"); +``` + +2.
Call DAPHNE + +```bash +./bin/daphne --enable-hdfs --hdfs-ip= --hdfs-username=ubuntu code.daph +``` + +This will create the following files inside HDFS: + +```bash +$ hdfs dfs -ls / +/randomSet.csv.hdfs/randomSet.csv.hdfs_segment_1 +/randomSet.csv.hdfs/randomSet.csv.hdfs_segment_1.meta + +$ hdfs dfs -cat /randomSet.csv.hdfs/randomSet.csv.hdfs_segment_1.meta +{"numCols":10,"numRows":10,"valueType":"f64"} +``` + +It also creates the `.meta` file within the local file system, named `randomSet.csv.hdfs.meta`: + +```json +{ + "hdfs": { + "HDFSFilename": "/randomSet.csv.hdfs", + "isHDFS": true + }, + "numCols": 10, + "numRows": 10, + "valueType": "f64" +} +``` + +### Limitations + +For now, writing to a specific directory within HDFS through DAPHNE is not supported. DAPHNE will always try to write under the root HDFS directory `/..hdfs`. + +## Distributed Runtime + +Both read and write operations are supported by the distributed runtime. + +### Read + +Exactly the same preprocessing must be done, creating one file inside HDFS with the +appropriate naming conventions. Users can then run DAPHNE using the +[distributed runtime](DistributedRuntime.md) and, depending on the generated pipeline, DAPHNE's distributed workers will read their +corresponding part of the data, speeding up I/O significantly. For example: + +1. DAPHNE script: + +``` +X = readMatrix("training_data.csv.hdfs"); +print(X+X); +``` + +2. Run DAPHNE + +```bash +$ export DISTRIBUTED_WORKERS=worker-1::worker-2: +$ ./bin/daphne --distributed --dist_backend=sync-gRPC --enable-hdfs --hdfs-ip= --hdfs-username=ubuntu code.daph +``` + +### Write + +Similar to read, nothing really changes; users just need to call DAPHNE using the distributed-runtime flags. Notice that since we have multiple workers/writers, more than +one segment is generated inside HDFS: + +1. Code + +``` +X = rand(10, 10, 0.0, 1.0, 1.0, 1); +writeMatrix(X, "randomSet.csv.hdfs"); +``` + +2. Call DAPHNE + +```bash +$ export DISTRIBUTED_WORKERS=worker-1::worker-2: +$ ./bin/daphne --distributed --dist_backend=sync-gRPC --enable-hdfs --hdfs-ip= --hdfs-username=ubuntu code.daph +``` + +Assuming 2 distributed workers: + +```bash +$ hdfs dfs -ls / +/randomSet.csv.hdfs/randomSet.csv.hdfs_segment_1 # First part of the matrix +/randomSet.csv.hdfs/randomSet.csv.hdfs_segment_1.meta +/randomSet.csv.hdfs/randomSet.csv.hdfs_segment_2 # Second part of the matrix +/randomSet.csv.hdfs/randomSet.csv.hdfs_segment_2.meta + +$ hdfs dfs -cat /randomSet.csv.hdfs/randomSet.csv.hdfs_segment_1.meta +{"numCols":10,"numRows":5,"valueType":"f64"} +$ hdfs dfs -cat /randomSet.csv.hdfs/randomSet.csv.hdfs_segment_2.meta +{"numCols":10,"numRows":5,"valueType":"f64"} +``` + +The `.meta` file within the local file system, named `randomSet.csv.hdfs.meta`, is also created. + +### Notes + +It does not matter how many segments are generated or exist. DAPHNE is designed to read +the segments according to the current state (distributed or not, and how many distributed +workers are being used). + +For example, if we use 4 distributed workers to write a matrix, +DAPHNE will generate 4 different segments. DAPHNE can later read the same matrix either in +local execution (no distributed runtime) or using a different number of workers, regardless of the number of segments generated earlier.
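+The upload conventions described above can also be scripted. The following is a
+minimal sketch (not part of DAPHNE; it assumes the 10x10 `f64` example data used
+throughout this document and a Hadoop CLI where `hdfs dfs -put -` reads from
+stdin) of pre-uploading a local CSV as a single-segment HDFS file together with
+the two `.meta` files:
+
+```bash
+FILE=training_data.csv          # local CSV, assumed to be 10x10 f64
+HDFS_DIR=/datasets/${FILE}.hdfs # directory inside HDFS, named after the file
+
+# upload the data as the one and only segment
+hdfs dfs -mkdir -p "${HDFS_DIR}"
+hdfs dfs -put "${FILE}" "${HDFS_DIR}/${FILE}.hdfs_segment_1"
+
+# per-segment .meta file inside HDFS (dimensions and value type of the segment)
+echo '{"numCols":10,"numRows":10,"valueType":"f64"}' \
+    | hdfs dfs -put - "${HDFS_DIR}/${FILE}.hdfs_segment_1.meta"
+
+# local .meta file next to the DaphneDSL script, pointing into HDFS
+cat > "./${FILE}.hdfs.meta" <<EOF
+{"hdfs":{"HDFSFilename":"${HDFS_DIR}","isHDFS":true},"numCols":10,"numRows":10,"valueType":"f64"}
+EOF
+```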
diff --git a/doc/development/ImplementBuiltinKernel.md b/doc/development/ImplementBuiltinKernel.md index 4ab3f487d..0dc093aa3 100644 --- a/doc/development/ImplementBuiltinKernel.md +++ b/doc/development/ImplementBuiltinKernel.md @@ -193,3 +193,27 @@ It is recommended to exceptions such as `throw std::runtime_error` in a kernel in case the code runs into an unresolvable issue. We catch these exceptions in our surrounding code to the kernel and provide, whenever possible, additional information about the source of the error in the DaphneDSL script. + + +### Experimental Kernels + +As an alternative to implementing a new kernel that is directly integrated into +DAPHNE, one can also work on kernel implementations using the [kernel catalog](doc/Extensions.md). +These should reside in [experimental/op/](src/runtime/local/kernels/experimental/op/) where `op` is +the mnemonic of the DaphneIR operation that the kernel is implementing. + +Experimental kernels are not directly integrated into DAPHNE and are neither +compiled nor executed by default. They can be used to test new ideas and +provide an easier way of prototyping kernel implementations. One can easily +test multiple different implementations of the same DAPHNE kernel using a +single DaphneDSL script which calls all the kernel implementations. + +There are fewer restrictions on experimental kernels than on built-in +kernels, e.g., they are not tested as part of the CI pipeline. You are also +free to introduce new dependencies that are handled by the accompanying +`Makefile` or build script. Testing and dependency management will have to be +resolved before the experimental kernel is integrated into DAPHNE as a built-in +kernel. + +Check out [Extensions.md](doc/Extensions.md) for more information on how to +implement experimental kernels. diff --git a/software-package-versions.txt b/software-package-versions.txt index 30e00f49f..cda7b2841 100644 --- a/software-package-versions.txt +++ b/software-package-versions.txt @@ -19,7 +19,7 @@ abslVersion=20230802.1 antlrVersion=4.9.2 arrowVersion=13.0.0 catch2Version=2.13.8 -cmakeVersion=3.30.3 +cmakeVersion=3.30.5 cudaVersion=12.6.1 eigenVersion=3.4.0 grpcVersion=1.38.0 @@ -28,7 +28,10 @@ nlohmannjsonVersion=3.10.5 openBlasVersion=0.3.23 openMPIVersion=4.1.5 papiVersion=7.0.1 -spdlogVersion=1.11.0 +# temporarily (2024-10-03) use the tip of the default branch for spdlog due to a compilation issue of the latest +# release 1.14.1 in combination with external fmt 11.0.2 +spdlogVersion=e593f6695c6065e6b345fe2862f04a519ed484e0 ubuntuVersion=24.04 hawqVersion=3.0.0.0 liburingVersion=2.7 +fmtVersion=11.0.2 diff --git a/src/api/cli/DaphneUserConfig.h b/src/api/cli/DaphneUserConfig.h index 071e15812..d33a18e30 100644 --- a/src/api/cli/DaphneUserConfig.h +++ b/src/api/cli/DaphneUserConfig.h @@ -14,29 +14,28 @@ * limitations under the License. */ - #pragma once #include #include -#include #include -#include #include +#include class DaphneLogger; -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include /* * Container to pass around user configuration */ struct DaphneUserConfig { - // Remember to update UserConfig.json accordingly! + // Remember to update UserConfig.json accordingly!
bool use_cuda = false; bool use_vectorized_exec = false; bool use_distributed = false; @@ -44,11 +43,11 @@ struct DaphneUserConfig { bool use_ipa_const_propa = true; bool use_phy_op_selection = true; bool use_mlir_codegen = false; - int matmul_vec_size_bits = 0; + int matmul_vec_size_bits = 0; bool matmul_tile = false; int matmul_unroll_factor = 1; - int matmul_unroll_jam_factor=4; - int matmul_num_vec_registers=16; + int matmul_unroll_jam_factor = 4; + int matmul_num_vec_registers = 16; bool matmul_use_fixed_tile_sizes = false; std::vector matmul_fixed_tile_sizes = {4, 4}; bool matmul_invert_loops = false; @@ -81,9 +80,12 @@ struct DaphneUserConfig { SelfSchedulingScheme taskPartitioningScheme = STATIC; QueueTypeOption queueSetupScheme = CENTRALIZED; - VictimSelectionLogic victimSelection = SEQPRI; - ALLOCATION_TYPE distributedBackEndSetup= ALLOCATION_TYPE::DIST_MPI; // default value - size_t max_distributed_serialization_chunk_size = std::numeric_limits::max() - 1024; // 2GB (-1KB to make up for gRPC headers etc.) - which is the maximum size allowed by gRPC / MPI. TODO: Investigate what might be the optimal. + VictimSelectionLogic victimSelection = SEQPRI; + ALLOCATION_TYPE distributedBackEndSetup = ALLOCATION_TYPE::DIST_MPI; // default value + size_t max_distributed_serialization_chunk_size = + std::numeric_limits::max() - 1024; // 2GB (-1KB to make up for gRPC headers etc.) - which is the + // maximum size allowed by gRPC / MPI. TODO: Investigate what + // might be the optimal. int numberOfThreads = -1; int minimumTaskSize = 1; @@ -92,14 +94,16 @@ struct DaphneUserConfig { std::string hdfs_Address = ""; std::string hdfs_username = ""; - // minimum considered log level (e.g., no logging below ERROR (essentially suppressing WARN, INFO, DEBUG and TRACE) + // minimum considered log level (e.g., no logging below ERROR (essentially + // suppressing WARN, INFO, DEBUG and TRACE) spdlog::level::level_enum log_level_limit = spdlog::level::err; std::vector loggers; - DaphneLogger* log_ptr{}; + DaphneLogger *log_ptr{}; float sparsity_threshold = 0.25; #ifdef USE_CUDA - // User config holds once context atm for convenience until we have proper system infrastructure + // User config holds once context atm for convenience until we have proper + // system infrastructure // CUDA device IDs (future work, as we create only one context atm) std::vector cuda_devices; @@ -110,28 +114,27 @@ struct DaphneUserConfig { #ifdef USE_FPGAOPENCL std::vector fpga_devices; #endif - - + std::string libdir = "{exedir}/../lib"; std::map> daphnedsl_import_paths; + // TODO Maybe the DaphneLib result should better reside in the + // DaphneContext, but having it here is simpler for now. + DaphneLibResult *result_struct = nullptr; - // TODO Maybe the DaphneLib result should better reside in the DaphneContext, - // but having it here is simpler for now. - DaphneLibResult* result_struct = nullptr; - KernelCatalog kernelCatalog; /** - * @brief Replaces the prefix `"{exedir}/"` in the field `libdir` by the path - * of the directory in which the currently running executable resides. + * @brief Replaces the prefix `"{exedir}/"` in the field `libdir` by the + * path of the directory in which the currently running executable resides. * - * Note that the current executable is not necessarily `daphne`. It could also - * be a distributed worker (e.g., `DistributedWorker`) or Python (`python3`). + * Note that the current executable is not necessarily `daphne`. 
It could + * also be a distributed worker (e.g., `DistributedWorker`) or Python + * (`python3`). */ void resolveLibDir() { const std::string exedirPlaceholder = "{exedir}/"; - if(libdir.substr(0, exedirPlaceholder.size()) == exedirPlaceholder) { + if (libdir.substr(0, exedirPlaceholder.size()) == exedirPlaceholder) { // This next line adds to our Linux platform lock-in. std::filesystem::path daphneExeDir(std::filesystem::canonical("/proc/self/exe").parent_path()); libdir = daphneExeDir / libdir.substr(exedirPlaceholder.size()); diff --git a/src/api/cli/StatusCode.h b/src/api/cli/StatusCode.h index f705f2e71..2be5be2b8 100644 --- a/src/api/cli/StatusCode.h +++ b/src/api/cli/StatusCode.h @@ -19,7 +19,7 @@ /** * @brief Possible status codes returned by the command line interface. - * + * * Note that this is deliberately not an `enum class`, because we frequently * need to use it as an integer. */ @@ -30,4 +30,4 @@ enum StatusCode { EXECUTION_ERROR, }; -#endif //SRC_API_CLI_STATUSCODE_H \ No newline at end of file +#endif // SRC_API_CLI_STATUSCODE_H \ No newline at end of file diff --git a/src/api/cli/daphne.cpp b/src/api/cli/daphne.cpp index 347ff81bc..bd83fbcce 100644 --- a/src/api/cli/daphne.cpp +++ b/src/api/cli/daphne.cpp @@ -16,6 +16,4 @@ #include -int main(int argc, const char** argv) { - return mainInternal(argc, argv, nullptr); -} +int main(int argc, const char **argv) { return mainInternal(argc, argv, nullptr); } diff --git a/src/api/daphnelib/DaphneLibResult.h b/src/api/daphnelib/DaphneLibResult.h index 4d346c8cb..2a7a99158 100644 --- a/src/api/daphnelib/DaphneLibResult.h +++ b/src/api/daphnelib/DaphneLibResult.h @@ -19,17 +19,16 @@ #include #include - struct DaphneLibResult { // For matrices. - void* address; + void *address; int64_t rows; int64_t cols; int64_t vtc; // For frames. - int64_t* vtcs; - char** labels; - void** columns; + int64_t *vtcs; + char **labels; + void **columns; // To pass error messages to Python code. std::string error_message; }; \ No newline at end of file diff --git a/src/api/daphnelib/daphnelib.cpp b/src/api/daphnelib/daphnelib.cpp index 1e6f99735..16c129e4c 100644 --- a/src/api/daphnelib/daphnelib.cpp +++ b/src/api/daphnelib/daphnelib.cpp @@ -25,15 +25,14 @@ DaphneLibResult daphneLibRes; /** * @brief Returns the result of a DaphneLib invocation. */ -extern "C" DaphneLibResult getResult() { - return daphneLibRes; -} +extern "C" DaphneLibResult getResult() { return daphneLibRes; } /** - * @brief Invokes DAPHNE with the specified DaphneDSL script and path to lib dir. + * @brief Invokes DAPHNE with the specified DaphneDSL script and path to lib + * dir. 
*/ -extern "C" int daphne(const char* libDirPath, const char* scriptPath) { - const char * argv[] = {"daphne", "--libdir", libDirPath, scriptPath}; +extern "C" int daphne(const char *libDirPath, const char *scriptPath) { + const char *argv[] = {"daphne", "--libdir", libDirPath, scriptPath}; int argc = 4; return mainInternal(argc, argv, &daphneLibRes); diff --git a/src/api/internal/daphne_internal.cpp b/src/api/internal/daphne_internal.cpp index d406f954d..5b533601b 100644 --- a/src/api/internal/daphne_internal.cpp +++ b/src/api/internal/daphne_internal.cpp @@ -15,19 +15,20 @@ */ #include "runtime/local/datastructures/IAllocationDescriptor.h" -#include + #ifdef USE_MPI - #include "runtime/distributed/worker/MPIWorker.h" +#include "runtime/distributed/worker/MPIWorker.h" #endif -#include -#include + +#include "compiler/execution/DaphneIrExecutor.h" #include +#include #include -#include -#include "compiler/execution/DaphneIrExecutor.h" -#include +#include #include #include +#include +#include #include #include #include @@ -38,7 +39,7 @@ #include "llvm/Support/CommandLine.h" #ifdef USE_CUDA - #include +#include #endif #include @@ -47,9 +48,8 @@ #include #include -#include #include -#include +#include #include #include @@ -60,49 +60,50 @@ using namespace std; using namespace mlir; using namespace llvm::cl; -void parseScriptArgs(const llvm::cl::list& scriptArgsCli, unordered_map& scriptArgsFinal) { - for(const std::string& pair : scriptArgsCli) { +void parseScriptArgs(const llvm::cl::list &scriptArgsCli, unordered_map &scriptArgsFinal) { + for (const std::string &pair : scriptArgsCli) { size_t pos = pair.find('='); - if(pos == string::npos) - throw std::runtime_error("script arguments must be specified as name=value, but found '" + pair + "'"); + if (pos == string::npos) + throw std::runtime_error("script arguments must be specified as " + "name=value, but found '" + + pair + "'"); const string argName = pair.substr(0, pos); const string argValue = pair.substr(pos + 1, pair.size()); - if(scriptArgsFinal.count(argName)) + if (scriptArgsFinal.count(argName)) throw runtime_error("script argument: '" + argName + "' was provided more than once"); scriptArgsFinal.emplace(argName, argValue); } } -void printVersion(llvm::raw_ostream& os) { +void printVersion(llvm::raw_ostream &os) { // TODO Include some of the important build flags into the version string. - os - << "DAPHNE Version 0.3\n" - << "An Open and Extensible System Infrastructure for Integrated Data Analysis Pipelines\n" - << "https://github.com/daphne-eu/daphne\n"; + os << "DAPHNE Version 0.3\n" + << "An Open and Extensible System Infrastructure for Integrated Data " + "Analysis Pipelines\n" + << "https://github.com/daphne-eu/daphne\n"; } -namespace -{ - volatile std::sig_atomic_t gSignalStatus; - jmp_buf return_from_handler; -} +namespace { +volatile std::sig_atomic_t gSignalStatus; +jmp_buf return_from_handler; +} // namespace void handleSignals(int signal) { constexpr int callstackMaxSize = 25; - void* callstack[callstackMaxSize]; + void *callstack[callstackMaxSize]; auto callstacksReturned = backtrace(callstack, callstackMaxSize); backtrace_symbols_fd(callstack, callstacksReturned, STDOUT_FILENO); gSignalStatus = signal; longjmp(return_from_handler, gSignalStatus); } -void logErrorDaphneLibAware(DaphneLibResult * daphneLibRes, std::string msg) { - if(daphneLibRes != nullptr) // For DaphneLib (Python API), error message is handled later in script.py. 
+void logErrorDaphneLibAware(DaphneLibResult *daphneLibRes, std::string msg) { + if (daphneLibRes != nullptr) // For DaphneLib (Python API), error message is handled later in script.py. daphneLibRes->error_message = msg; else spdlog::error(msg); } -int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int *id, DaphneUserConfig& user_config){ +int startDAPHNE(int argc, const char **argv, DaphneLibResult *daphneLibRes, int *id, DaphneUserConfig &user_config) { using clock = std::chrono::high_resolution_clock; clock::time_point tpBeg = clock::now(); @@ -113,330 +114,240 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int // ************************************************************************ // Parse command line arguments // ************************************************************************ - + // ------------------------------------------------------------------------ // Define options // ------------------------------------------------------------------------ // All the variables concerned with the LLVM command line parser (those of // type OptionCategory, opt, ...) must be declared static here, because - // this function may run multiple times in the context of DaphneLib (DAPHNE's - // Python API). Without static, the second invocation of this function - // crashes because the options set in the first invocation are still present - // in some global state. This must be due to the way the LLVM command line - // library handles its internal state. - + // this function may run multiple times in the context of DaphneLib + // (DAPHNE's Python API). Without static, the second invocation of this + // function crashes because the options set in the first invocation are + // still present in some global state. This must be due to the way the LLVM + // command line library handles its internal state. + // Option categories ------------------------------------------------------ - + // TODO We will probably subdivide the options into multiple groups later. 
static OptionCategory daphneOptions("DAPHNE Options"); static OptionCategory schedulingOptions("Advanced Scheduling Knobs"); static OptionCategory distributedBackEndSetupOptions("Distributed Backend Knobs"); static OptionCategory HDFSOptions("HDFS Knobs"); - // Options ---------------------------------------------------------------- // Distributed backend Knobs - static opt distributedBackEndSetup("dist_backend", cat(distributedBackEndSetupOptions), - desc("Choose the options for the distribution backend:"), - values( - clEnumValN(ALLOCATION_TYPE::DIST_MPI, "MPI", "Use message passing interface for internode data exchange (default)"), - clEnumValN(ALLOCATION_TYPE::DIST_GRPC_SYNC, "sync-gRPC", "Use remote procedure call (synchronous gRPC with threading) for internode data exchange"), - clEnumValN(ALLOCATION_TYPE::DIST_GRPC_ASYNC, "async-gRPC", "Use remote procedure call (asynchronous gRPC) for internode data exchange") - ), - init(ALLOCATION_TYPE::DIST_MPI) - ); - static opt maxDistrChunkSize("max-distr-chunk-size", cat(distributedBackEndSetupOptions), - desc( - "Define the maximum chunk size per message for the distributed runtime (in bytes)" - "(default is close to maximum allowed ~2GB)" - ), - init(std::numeric_limits::max() - 1024) - ); + static opt distributedBackEndSetup( + "dist_backend", cat(distributedBackEndSetupOptions), desc("Choose the options for the distribution backend:"), + values(clEnumValN(ALLOCATION_TYPE::DIST_MPI, "MPI", + "Use message passing interface for internode data " + "exchange (default)"), + clEnumValN(ALLOCATION_TYPE::DIST_GRPC_SYNC, "sync-gRPC", + "Use remote procedure call (synchronous gRPC with " + "threading) for internode data exchange"), + clEnumValN(ALLOCATION_TYPE::DIST_GRPC_ASYNC, "async-gRPC", + "Use remote procedure call (asynchronous gRPC) for " + "internode data exchange")), + init(ALLOCATION_TYPE::DIST_MPI)); + static opt maxDistrChunkSize("max-distr-chunk-size", cat(distributedBackEndSetupOptions), + desc("Define the maximum chunk size per message for the distributed " + "runtime (in bytes)" + "(default is close to maximum allowed ~2GB)"), + init(std::numeric_limits::max() - 1024)); // HDFS knobs - static opt use_hdfs( - "enable-hdfs", cat(HDFSOptions), - desc("Enable HDFS filesystem") - ); - static opt hdfs_Address( - "hdfs-ip", cat(HDFSOptions), - desc("IP of the HDFS filesystem (including port)."), - init("") - ); - static opt hdfs_username( - "hdfs-username", cat(HDFSOptions), - desc("Username of the HDFS filesystem."), - init("") - ); - - + static opt use_hdfs("enable-hdfs", cat(HDFSOptions), desc("Enable HDFS filesystem")); + static opt hdfs_Address("hdfs-ip", cat(HDFSOptions), desc("IP of the HDFS filesystem (including port)."), + init("")); + static opt hdfs_username("hdfs-username", cat(HDFSOptions), desc("Username of the HDFS filesystem."), + init("")); + // Scheduling options - static opt taskPartitioningScheme("partitioning", - cat(schedulingOptions), desc("Choose task partitioning scheme:"), - values( - clEnumVal(STATIC , "Static (default)"), - clEnumVal(SS, "Self-scheduling"), - clEnumVal(GSS, "Guided self-scheduling"), - clEnumVal(TSS, "Trapezoid self-scheduling"), - clEnumVal(FAC2, "Factoring self-scheduling"), - clEnumVal(TFSS, "Trapezoid Factoring self-scheduling"), - clEnumVal(FISS, "Fixed-increase self-scheduling"), - clEnumVal(VISS, "Variable-increase self-scheduling"), - clEnumVal(PLS, "Performance loop-based self-scheduling"), - clEnumVal(MSTATIC, "Modified version of Static, i.e., instead of n/p, it uses n/(4*p) where n 
is number of tasks and p is number of threads"), - clEnumVal(MFSC, "Modified version of fixed size chunk self-scheduling, i.e., MFSC does not require profiling information as FSC"), - clEnumVal(PSS, "Probabilistic self-scheduling"), - clEnumVal(AUTO, "Automatic partitioning") - ), - init(STATIC) - ); - static opt queueSetupScheme("queue_layout", - cat(schedulingOptions), desc("Choose queue setup scheme:"), - values( - clEnumVal(CENTRALIZED, "One queue (default)"), - clEnumVal(PERGROUP, "One queue per CPU group"), - clEnumVal(PERCPU, "One queue per CPU core") - ), - init(CENTRALIZED) - ); - static opt victimSelection("victim_selection", - cat(schedulingOptions), desc("Choose work stealing victim selection logic:"), - values( - clEnumVal(SEQ, "Steal from next adjacent worker (default)"), - clEnumVal(SEQPRI, "Steal from next adjacent worker, prioritize same NUMA domain"), - clEnumVal(RANDOM, "Steal from random worker"), - clEnumVal(RANDOMPRI, "Steal from random worker, prioritize same NUMA domain") - ), - init(SEQ) - ); - - static opt numberOfThreads( - "num-threads", cat(schedulingOptions), - desc( - "Define the number of the CPU threads used by the vectorized execution engine " - "(default is equal to the number of physical cores on the target node that executes the code)" - ) - ); - static opt minimumTaskSize( - "grain-size", cat(schedulingOptions), - desc( - "Define the minimum grain size of a task (default is 1)" - ), - init(1) - ); - static opt useVectorizedPipelines( - "vec", cat(schedulingOptions), - desc("Enable vectorized execution engine") - ); - static opt useDistributedRuntime( - "distributed", cat(daphneOptions), - desc("Enable distributed runtime") - ); - static opt prePartitionRows( - "pre-partition", cat(schedulingOptions), - desc("Partition rows into the number of queues before applying scheduling technique") - ); - static opt pinWorkers( - "pin-workers", cat(schedulingOptions), - desc("Pin workers to CPU cores") - ); - static opt hyperthreadingEnabled( - "hyperthreading", cat(schedulingOptions), - desc("Utilize multiple logical CPUs located on the same physical CPU") - ); - static opt debugMultiThreading( - "debug-mt", cat(schedulingOptions), - desc("Prints debug information about the Multithreading Wrapper") - ); - + static opt taskPartitioningScheme( + "partitioning", cat(schedulingOptions), desc("Choose task partitioning scheme:"), + values(clEnumVal(STATIC, "Static (default)"), clEnumVal(SS, "Self-scheduling"), + clEnumVal(GSS, "Guided self-scheduling"), clEnumVal(TSS, "Trapezoid self-scheduling"), + clEnumVal(FAC2, "Factoring self-scheduling"), clEnumVal(TFSS, "Trapezoid Factoring self-scheduling"), + clEnumVal(FISS, "Fixed-increase self-scheduling"), clEnumVal(VISS, "Variable-increase self-scheduling"), + clEnumVal(PLS, "Performance loop-based self-scheduling"), + clEnumVal(MSTATIC, "Modified version of Static, i.e., instead " + "of n/p, it uses n/(4*p) where n is number " + "of tasks and p is number of threads"), + clEnumVal(MFSC, "Modified version of fixed size chunk self-scheduling, " + "i.e., MFSC does not require profiling information as FSC"), + clEnumVal(PSS, "Probabilistic self-scheduling"), clEnumVal(AUTO, "Automatic partitioning")), + init(STATIC)); + static opt queueSetupScheme( + "queue_layout", cat(schedulingOptions), desc("Choose queue setup scheme:"), + values(clEnumVal(CENTRALIZED, "One queue (default)"), clEnumVal(PERGROUP, "One queue per CPU group"), + clEnumVal(PERCPU, "One queue per CPU core")), + init(CENTRALIZED)); + static opt victimSelection( + 
"victim_selection", cat(schedulingOptions), desc("Choose work stealing victim selection logic:"), + values(clEnumVal(SEQ, "Steal from next adjacent worker (default)"), + clEnumVal(SEQPRI, "Steal from next adjacent worker, prioritize same NUMA domain"), + clEnumVal(RANDOM, "Steal from random worker"), + clEnumVal(RANDOMPRI, "Steal from random worker, prioritize same NUMA domain")), + init(SEQ)); + + static opt numberOfThreads("num-threads", cat(schedulingOptions), + desc("Define the number of the CPU threads used by the vectorized " + "execution engine " + "(default is equal to the number of physical cores on the target " + "node that executes the code)")); + static opt minimumTaskSize("grain-size", cat(schedulingOptions), + desc("Define the minimum grain size of a task (default is 1)"), init(1)); + static opt useVectorizedPipelines("vec", cat(schedulingOptions), desc("Enable vectorized execution engine")); + static opt useDistributedRuntime("distributed", cat(daphneOptions), desc("Enable distributed runtime")); + static opt prePartitionRows("pre-partition", cat(schedulingOptions), + desc("Partition rows into the number of queues before applying " + "scheduling technique")); + static opt pinWorkers("pin-workers", cat(schedulingOptions), desc("Pin workers to CPU cores")); + static opt hyperthreadingEnabled("hyperthreading", cat(schedulingOptions), + desc("Utilize multiple logical CPUs located on the same physical CPU")); + static opt debugMultiThreading("debug-mt", cat(schedulingOptions), + desc("Prints debug information about the Multithreading Wrapper")); + // Other options - static opt noObjRefMgnt( - "no-obj-ref-mgnt", cat(daphneOptions), - desc( - "Switch off garbage collection by not managing data " - "objects' reference counters" - ) - ); - static opt noIPAConstPropa( - "no-ipa-const-propa", cat(daphneOptions), - desc("Switch off inter-procedural constant propagation") - ); - static opt noPhyOpSelection( - "no-phy-op-selection", cat(daphneOptions), - desc("Switch off physical operator selection, use default kernels for all operations") - ); - static opt selectMatrixRepr( - "select-matrix-repr", cat(daphneOptions), - desc( - "Automatically choose physical matrix representations " - "(e.g., dense/sparse)" - ) - ); + static opt noObjRefMgnt("no-obj-ref-mgnt", cat(daphneOptions), + desc("Switch off garbage collection by not managing data " + "objects' reference counters")); + static opt noIPAConstPropa("no-ipa-const-propa", cat(daphneOptions), + desc("Switch off inter-procedural constant propagation")); + static opt noPhyOpSelection("no-phy-op-selection", cat(daphneOptions), + desc("Switch off physical operator selection, use default kernels for " + "all operations")); + static opt selectMatrixRepr("select-matrix-repr", cat(daphneOptions), + desc("Automatically choose physical matrix representations " + "(e.g., dense/sparse)")); static alias selectMatrixReprAlias( // to still support the longer old form - "select-matrix-representations", aliasopt(selectMatrixRepr), - desc("Alias for --select-matrix-repr") - ); - static opt cuda( - "cuda", cat(daphneOptions), - desc("Use CUDA") - ); - static opt fpgaopencl( - "fpgaopencl", cat(daphneOptions), - desc("Use FPGAOPENCL") - ); - static opt libDir( - "libdir", cat(daphneOptions), - desc( - "The directory containing the kernel catalog files " - "(typically, but not necessarily, along with the kernel shared libraries)" - ) - ); - - static opt mlirCodegen( - "mlir-codegen", cat(daphneOptions), - desc("Enables lowering of certain DaphneIR operations 
on DenseMatrix to low-level MLIR operations.") - ); - static opt matmul_vec_size_bits( - "matmul-vec-size-bits", cat(daphneOptions), - desc("Set the vector size to be used in the lowering of the MatMul operation if possible. Value of 0 is interpreted as off switch."), - init(0) - ); - static opt matmul_tile( - "matmul-tile", cat(daphneOptions), - desc("Enables loop tiling in the lowering of the MatMul operation.") - ); - static opt matmul_unroll_factor( - "matmul-unroll-factor", cat(daphneOptions), - desc("Factor by which to unroll the finally resulting inner most loop in the lowered MatMul if tiling is used."), - init(1) - ); - static opt matmul_unroll_jam_factor( - "matmul-unroll-jam-factor", cat(daphneOptions), - desc("Factor by which to unroll jam the two inner most loop in the lowered MatMul if tiling is used."), - init(4) - ); - static opt matmul_num_vec_registers( - "matmul-num-vec-registers", cat(daphneOptions), - desc("Number of vector registers. Used during automatic tiling in lowering of MatMulOp"), - init(16) - ); + "select-matrix-representations", aliasopt(selectMatrixRepr), desc("Alias for --select-matrix-repr")); + static opt cuda("cuda", cat(daphneOptions), desc("Use CUDA")); + static opt fpgaopencl("fpgaopencl", cat(daphneOptions), desc("Use FPGAOPENCL")); + static opt libDir("libdir", cat(daphneOptions), + desc("The directory containing the kernel catalog files " "(typically, but not necessarily, along with the kernel shared " "libraries)")); + + static opt mlirCodegen("mlir-codegen", cat(daphneOptions), + desc("Enables lowering of certain DaphneIR operations on DenseMatrix " "to low-level MLIR operations.")); + static opt matmul_vec_size_bits("matmul-vec-size-bits", cat(daphneOptions), + desc("Set the vector size to be used in the lowering of the MatMul " "operation if possible. A value of 0 is interpreted as an off switch."), + init(0)); + static opt matmul_tile("matmul-tile", cat(daphneOptions), + desc("Enables loop tiling in the lowering of the MatMul operation.")); + static opt matmul_unroll_factor("matmul-unroll-factor", cat(daphneOptions), + desc("Factor by which to unroll the finally resulting innermost loop " "in the lowered MatMul if tiling is used."), + init(1)); + static opt matmul_unroll_jam_factor("matmul-unroll-jam-factor", cat(daphneOptions), + desc("Factor by which to unroll-jam the two innermost loops in the " "lowered MatMul if tiling is used."), + init(4)); + static opt matmul_num_vec_registers("matmul-num-vec-registers", cat(daphneOptions), + desc("Number of vector registers. Used during automatic tiling in " "lowering of MatMulOp."), + init(16)); static llvm::cl::list matmul_fixed_tile_sizes( "matmul-fixed-tile-sizes", cat(daphneOptions), - desc("Set fixed tile sizes to be used for the lowering of MatMul if tiling is used. This also enables tiling."), - CommaSeparated - ); - static opt matmul_invert_loops( - "matmul-invert-loops", cat(daphneOptions), - desc("Enable inverting of the inner two loops in the matrix multiplication as a fallback option, if tiling is not possible or deactivated.") - ); - - - static opt performHybridCodegen( - "mlir-hybrid-codegen", cat(daphneOptions), - desc("Enables prototypical hybrid code generation combining pre-compiled kernels and MLIR code generation.") - ); - static opt kernelExt( - "kernel-ext", cat(daphneOptions), - desc("Additional kernel extension to register (path to a kernel catalog JSON file).") - ); + desc("Set fixed tile sizes to be used for the lowering of MatMul if " "tiling is used. 
This also enables tiling."), + CommaSeparated); + static opt matmul_invert_loops("matmul-invert-loops", cat(daphneOptions), + desc("Enable inverting of the inner two loops in the matrix " + "multiplication as a fallback option, if tiling is not possible " + "or deactivated.")); + + static opt performHybridCodegen("mlir-hybrid-codegen", cat(daphneOptions), + desc("Enables prototypical hybrid code generation combining " + "pre-compiled kernels and MLIR code generation.")); + static opt kernelExt("kernel-ext", cat(daphneOptions), + desc("Additional kernel extension to register " + "(path to a kernel catalog JSON file).")); enum ExplainArgs { - kernels, - llvm, - parsing, - parsing_simplified, - property_inference, - select_matrix_repr, - sql, - phy_op_selection, - type_adaptation, - vectorized, - obj_ref_mgnt, - mlir_codegen + kernels, + llvm, + parsing, + parsing_simplified, + property_inference, + select_matrix_repr, + sql, + phy_op_selection, + type_adaptation, + vectorized, + obj_ref_mgnt, + mlir_codegen }; static llvm::cl::list explainArgList( "explain", cat(daphneOptions), llvm::cl::desc("Show DaphneIR after certain compiler passes (separate " "multiple values by comma, the order is irrelevant)"), - llvm::cl::values( - clEnumVal(parsing, "Show DaphneIR after parsing"), - clEnumVal(parsing_simplified, "Show DaphneIR after parsing and some simplifications"), - clEnumVal(sql, "Show DaphneIR after SQL parsing"), - clEnumVal(property_inference, "Show DaphneIR after property inference"), - clEnumVal(select_matrix_repr, "Show DaphneIR after selecting physical matrix representations"), - clEnumVal(phy_op_selection, "Show DaphneIR after selecting physical operators"), - clEnumVal(type_adaptation, "Show DaphneIR after adapting types to available kernels"), - clEnumVal(vectorized, "Show DaphneIR after vectorization"), - clEnumVal(obj_ref_mgnt, "Show DaphneIR after managing object references"), - clEnumVal(kernels, "Show DaphneIR after kernel lowering"), - clEnumVal(llvm, "Show DaphneIR after llvm lowering"), - clEnumVal(mlir_codegen, "Show DaphneIR after MLIR codegen")), + llvm::cl::values(clEnumVal(parsing, "Show DaphneIR after parsing"), + clEnumVal(parsing_simplified, "Show DaphneIR after parsing and some simplifications"), + clEnumVal(sql, "Show DaphneIR after SQL parsing"), + clEnumVal(property_inference, "Show DaphneIR after property inference"), + clEnumVal(select_matrix_repr, "Show DaphneIR after selecting " + "physical matrix representations"), + clEnumVal(phy_op_selection, "Show DaphneIR after selecting physical operators"), + clEnumVal(type_adaptation, "Show DaphneIR after adapting types to available kernels"), + clEnumVal(vectorized, "Show DaphneIR after vectorization"), + clEnumVal(obj_ref_mgnt, "Show DaphneIR after managing object references"), + clEnumVal(kernels, "Show DaphneIR after kernel lowering"), + clEnumVal(llvm, "Show DaphneIR after llvm lowering"), + clEnumVal(mlir_codegen, "Show DaphneIR after MLIR codegen")), CommaSeparated); - static llvm::cl::list scriptArgs1( - "args", cat(daphneOptions), - desc( - "Alternative way of specifying arguments to the DaphneDSL " - "script; must be a comma-separated list of name-value-pairs, " - "e.g., `--args x=1,y=2.2`" - ), - CommaSeparated - ); + static llvm::cl::list scriptArgs1("args", cat(daphneOptions), + desc("Alternative way of specifying arguments to the DaphneDSL " + "script; must be a comma-separated list of name-value-pairs, " + "e.g., `--args x=1,y=2.2`"), + CommaSeparated); const std::string configFileInitValue = "-"; - 
static opt configFile( - "config", cat(daphneOptions), - desc("A JSON file that contains the DAPHNE configuration"), - value_desc("filename"), - llvm::cl::init(configFileInitValue) - ); - - static opt enableStatistics( - "statistics", cat(daphneOptions), - desc("Enables runtime statistics output.")); - - static opt enableProfiling ( - "enable-profiling", cat(daphneOptions), - desc("Enable profiling support") - ); - static opt timing ( - "timing", cat(daphneOptions), - desc("Enable timing of high-level steps (start-up, parsing, compilation, execution) and print the times to stderr in JSON format") - ); + static opt configFile("config", cat(daphneOptions), + desc("A JSON file that contains the DAPHNE configuration"), value_desc("filename"), + llvm::cl::init(configFileInitValue)); + + static opt enableStatistics("statistics", cat(daphneOptions), desc("Enables runtime statistics output.")); + + static opt enableProfiling("enable-profiling", cat(daphneOptions), desc("Enable profiling support")); + static opt timing("timing", cat(daphneOptions), + desc("Enable timing of high-level steps (start-up, " + "parsing, compilation, execution) and print " + "the times to stderr in JSON format")); // Positional arguments --------------------------------------------------- - + static opt inputFile(Positional, desc("script"), Required); static llvm::cl::list scriptArgs2(ConsumeAfter, desc("[arguments]")); // ------------------------------------------------------------------------ // Parse arguments // ------------------------------------------------------------------------ - + static std::vector visibleCategories; visibleCategories.push_back(&daphneOptions); visibleCategories.push_back(&schedulingOptions); visibleCategories.push_back(&distributedBackEndSetupOptions); visibleCategories.push_back(&HDFSOptions); - + HideUnrelatedOptions(visibleCategories); - extrahelp( - "\nEXAMPLES:\n\n" - " daphne example.daphne\n" - " daphne --vec example.daphne x=1 y=2.2 z=\"foo\"\n" - " daphne --vec --args x=1,y=2.2,z=\"foo\" example.daphne\n" - " daphne --vec --args x=1,y=2.2 example.daphne z=\"foo\"\n" - ); + extrahelp("\nEXAMPLES:\n\n" + " daphne example.daphne\n" + " daphne --vec example.daphne x=1 y=2.2 z=\"foo\"\n" + " daphne --vec --args x=1,y=2.2,z=\"foo\" example.daphne\n" + " daphne --vec --args x=1,y=2.2 example.daphne z=\"foo\"\n"); SetVersionPrinter(&printVersion); - ParseCommandLineOptions( - argc, argv, - "The DAPHNE Prototype.\n\nThis program compiles and executes a DaphneDSL script.\n" - ); + ParseCommandLineOptions(argc, argv, + "The DAPHNE Prototype.\n\nThis program compiles " + "and executes a DaphneDSL script.\n"); // ************************************************************************ // Process parsed arguments @@ -446,18 +357,17 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int if (configFile != configFileInitValue && ConfigParser::fileExists(configFile)) { ConfigParser::readUserConfig(configFile, user_config); } - } - catch(std::exception & e) { + } catch (std::exception &e) { logErrorDaphneLibAware(daphneLibRes, "Parser error while reading user config:\n" + std::string(e.what())); return StatusCode::PARSER_ERROR; } // initialize logging facility - if(not logger) + if (not logger) logger = std::make_unique(user_config); user_config.use_vectorized_exec = useVectorizedPipelines; - user_config.use_distributed = useDistributedRuntime; + user_config.use_distributed = useDistributedRuntime; user_config.use_obj_ref_mgnt = !noObjRefMgnt; user_config.use_ipa_const_propa 
= !noIPAConstPropa; user_config.use_phy_op_selection = !noPhyOpSelection; @@ -476,33 +386,34 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int } user_config.use_mlir_hybrid_codegen = performHybridCodegen; - if(!libDir.getValue().empty()) + if (!libDir.getValue().empty()) user_config.libdir = libDir.getValue(); user_config.resolveLibDir(); user_config.taskPartitioningScheme = taskPartitioningScheme; user_config.queueSetupScheme = queueSetupScheme; - user_config.victimSelection = victimSelection; + user_config.victimSelection = victimSelection; // only overwrite with non-defaults - if(numberOfThreads != 0) { + if (numberOfThreads != 0) { spdlog::trace("Overwriting config file supplied numberOfThreads={} with command line argument --num-threads={}", - user_config.numberOfThreads, numberOfThreads); + user_config.numberOfThreads, static_cast(numberOfThreads)); user_config.numberOfThreads = numberOfThreads; } - user_config.minimumTaskSize = minimumTaskSize; + user_config.minimumTaskSize = minimumTaskSize; user_config.pinWorkers = pinWorkers; user_config.hyperthreadingEnabled = hyperthreadingEnabled; user_config.debugMultiThreading = debugMultiThreading; user_config.prePartitionRows = prePartitionRows; user_config.distributedBackEndSetup = distributedBackEndSetup; - if(user_config.use_distributed) - { - if(user_config.distributedBackEndSetup!=ALLOCATION_TYPE::DIST_MPI && user_config.distributedBackEndSetup!=ALLOCATION_TYPE::DIST_GRPC_SYNC && user_config.distributedBackEndSetup!=ALLOCATION_TYPE::DIST_GRPC_ASYNC) + if (user_config.use_distributed) { + if (user_config.distributedBackEndSetup != ALLOCATION_TYPE::DIST_MPI && + user_config.distributedBackEndSetup != ALLOCATION_TYPE::DIST_GRPC_SYNC && + user_config.distributedBackEndSetup != ALLOCATION_TYPE::DIST_GRPC_ASYNC) spdlog::warn("No backend has been selected. 
Will use the default 'MPI'"); } - user_config.max_distributed_serialization_chunk_size = maxDistrChunkSize; + user_config.max_distributed_serialization_chunk_size = maxDistrChunkSize; // only overwrite with non-defaults if (use_hdfs) { @@ -514,96 +425,99 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int if (hdfs_username != "") { user_config.hdfs_username = hdfs_username; } - if (user_config.use_hdfs && (user_config.hdfs_Address == "" || user_config.hdfs_username == "")){ - spdlog::warn("HDFS is enabled, but the HDFS IP address or username were not provided."); + if (user_config.use_hdfs && (user_config.hdfs_Address == "" || user_config.hdfs_username == "")) { + spdlog::warn("HDFS is enabled, but the HDFS IP address or username " "were not provided."); } #ifndef USE_HDFS - if (user_config.use_hdfs){ - throw std::runtime_error("you are trying to use HDFS, but Daphne was not build with --hdfs option\n"); + if (user_config.use_hdfs) { + throw std::runtime_error("you are trying to use HDFS, but Daphne was " "not built with --hdfs option\n"); } #endif for (auto explain : explainArgList) { switch (explain) { - case kernels: - user_config.explain_kernels = true; - break; - case llvm: - user_config.explain_llvm = true; - break; - case parsing: - user_config.explain_parsing = true; - break; - case parsing_simplified: - user_config.explain_parsing_simplified = true; - break; - case property_inference: - user_config.explain_property_inference = true; - break; - case select_matrix_repr: - user_config.explain_select_matrix_repr = true; - break; - case sql: - user_config.explain_sql = true; - break; - case phy_op_selection: - user_config.explain_phy_op_selection = true; - break; - case type_adaptation: - user_config.explain_type_adaptation = true; - break; - case vectorized: - user_config.explain_vectorized = true; - break; - case obj_ref_mgnt: - user_config.explain_obj_ref_mgnt = true; - break; - case mlir_codegen: - user_config.explain_mlir_codegen = true; - break; + case kernels: + user_config.explain_kernels = true; + break; + case llvm: + user_config.explain_llvm = true; + break; + case parsing: + user_config.explain_parsing = true; + break; + case parsing_simplified: + user_config.explain_parsing_simplified = true; + break; + case property_inference: + user_config.explain_property_inference = true; + break; + case select_matrix_repr: + user_config.explain_select_matrix_repr = true; + break; + case sql: + user_config.explain_sql = true; + break; + case phy_op_selection: + user_config.explain_phy_op_selection = true; + break; + case type_adaptation: + user_config.explain_type_adaptation = true; + break; + case vectorized: + user_config.explain_vectorized = true; + break; + case obj_ref_mgnt: + user_config.explain_obj_ref_mgnt = true; + break; + case mlir_codegen: + user_config.explain_mlir_codegen = true; + break; } } user_config.statistics = enableStatistics; - if(user_config.use_distributed && distributedBackEndSetup==ALLOCATION_TYPE::DIST_MPI) - { + if (user_config.use_distributed && distributedBackEndSetup == ALLOCATION_TYPE::DIST_MPI) { #ifndef USE_MPI - throw std::runtime_error("you are trying to use the MPI backend. But, Daphne was not build with --mpi option\n"); + throw std::runtime_error("you are trying to use the MPI backend, 
But, " + "Daphne was not build with --mpi option\n"); #else - MPI_Init(NULL,NULL); + MPI_Init(NULL, NULL); MPI_Comm_rank(MPI_COMM_WORLD, id); - int size=0; + int size = 0; MPI_Comm_size(MPI_COMM_WORLD, &size); - if(size<=1) - { - throw std::runtime_error("you need to rerun with at least 2 MPI ranks (1 Master + 1 Worker)\n"); + if (size <= 1) { + throw std::runtime_error("you need to rerun with at least 2 MPI " + "ranks (1 Master + 1 Worker)\n"); } - if(*id!=COORDINATOR) - { - return *id; + if (*id != COORDINATOR) { + return *id; } -#endif +#endif } - if(cuda) { + if (cuda) { int device_count = 0; #ifdef USE_CUDA CHECK_CUDART(cudaGetDeviceCount(&device_count)); #endif - if(device_count < 1) - spdlog::warn("CUDA ops requested by user option but no suitable device found"); + if (device_count < 1) + spdlog::warn("CUDA ops requested by user option but no suitable " + "device found"); else { user_config.use_cuda = true; } } - if(fpgaopencl) { + if (fpgaopencl) { user_config.use_fpgaopencl = true; } - if(enableProfiling) { + if (enableProfiling) { #ifndef USE_PAPI - throw std::runtime_error("you are trying to use profiling, but daphne was built with --no-papi\n"); + throw std::runtime_error("you are trying to use profiling, but daphne " + "was built with --no-papi\n"); #else user_config.enable_profiling = true; #endif @@ -617,8 +531,7 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int try { parseScriptArgs(scriptArgs2, scriptArgsFinal); parseScriptArgs(scriptArgs1, scriptArgsFinal); - } - catch(exception& e) { + } catch (exception &e) { logErrorDaphneLibAware(daphneLibRes, "Parser error: " + std::string(e.what())); return StatusCode::PARSER_ERROR; } @@ -629,20 +542,20 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int // Creates an MLIR context and loads the required MLIR dialects. 
DaphneIrExecutor executor(selectMatrixRepr, user_config); - mlir::MLIRContext * mctx = executor.getContext(); + mlir::MLIRContext *mctx = executor.getContext(); // ************************************************************************ // Populate kernel extension catalog // ************************************************************************ - KernelCatalog & kc = executor.getUserConfig().kernelCatalog; + KernelCatalog &kc = executor.getUserConfig().kernelCatalog; // kc.dump(); KernelCatalogParser kcp(mctx); kcp.parseKernelCatalog(user_config.libdir + "/catalog.json", kc); - if(user_config.use_cuda) + if (user_config.use_cuda) kcp.parseKernelCatalog(user_config.libdir + "/CUDAcatalog.json", kc); // kc.dump(); - if(!kernelExt.empty()) + if (!kernelExt.empty()) kcp.parseKernelCatalog(kernelExt, kc); // ************************************************************************ @@ -657,7 +570,7 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int OpBuilder builder(mctx); auto loc = mlir::FileLineColLoc::get(builder.getStringAttr(inputFile), 0, 0); auto moduleOp = ModuleOp::create(loc); - auto * body = moduleOp.getBody(); + auto *body = moduleOp.getBody(); builder.setInsertionPoint(body, body->begin()); // Parse the input file and generate the corresponding DaphneIR operations @@ -665,8 +578,7 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int DaphneDSLParser parser(scriptArgsFinal, user_config); try { parser.parseFile(builder, inputFile); - } - catch(std::exception & e) { + } catch (std::exception &e) { logErrorDaphneLibAware(daphneLibRes, "While parsing: " + std::string(e.what())); return StatusCode::PARSER_ERROR; } @@ -674,16 +586,14 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int clock::time_point tpBegComp = clock::now(); // Further, process the module, including optimization and lowering passes. - try{ + try { if (!executor.runPasses(moduleOp)) { return StatusCode::PASS_ERROR; } } catch (std::exception &e) { - logErrorDaphneLibAware( - daphneLibRes, - "Lowering pipeline error.{}\nPassManager failed module lowering, " - "responsible IR written to module_fail.log.\n" + std::string(e.what()) - ); + logErrorDaphneLibAware(daphneLibRes, "Lowering pipeline error.{}\nPassManager failed module lowering, " + "responsible IR written to module_fail.log.\n" + + std::string(e.what())); return StatusCode::PASS_ERROR; } catch (...) { logErrorDaphneLibAware(daphneLibRes, "Lowering pipeline error: Unknown exception"); @@ -693,91 +603,86 @@ int startDAPHNE(int argc, const char** argv, DaphneLibResult* daphneLibRes, int // JIT-compile the module and execute it. // module->dump(); // print the LLVM IR representation clock::time_point tpBegExec; - try{ + try { auto engine = executor.createExecutionEngine(moduleOp); tpBegExec = clock::now(); - // set jump address for catching exceptions in kernel libraries via signal handling - if(setjmp(return_from_handler) == 0) { + // set jump address for catching exceptions in kernel libraries via + // signal handling + if (setjmp(return_from_handler) == 0) { auto error = engine->invoke("main"); if (error) { llvm::errs() << "JIT-Engine invocation failed: " << error; return StatusCode::EXECUTION_ERROR; } - } - else { - logErrorDaphneLibAware( - daphneLibRes, - "Got an abort signal from the execution engine. Most likely an " - "exception in a shared library. 
Check logs!\n" - "Execution error: Returning from signal " + std::to_string(gSignalStatus) - ); + } else { + logErrorDaphneLibAware(daphneLibRes, "Got an abort signal from the execution engine. Most likely an " + "exception in a shared library. Check logs!\n" + "Execution error: Returning from signal " + + std::to_string(gSignalStatus)); return StatusCode::EXECUTION_ERROR; } - } - catch (std::runtime_error& re) { + } catch (std::runtime_error &re) { logErrorDaphneLibAware(daphneLibRes, "Execution error: " + std::string(re.what())); return StatusCode::EXECUTION_ERROR; - } - catch(std::exception & e){ + } catch (std::exception &e) { logErrorDaphneLibAware(daphneLibRes, "Execution error " + std::string(e.what())); return StatusCode::EXECUTION_ERROR; } clock::time_point tpEnd = clock::now(); - if(timing) { + if (timing) { // Calculate durations of the individual high-level steps of DAPHNE. - double durStrt = chrono::duration_cast>(tpBegPars - tpBeg ).count(); - double durPars = chrono::duration_cast>(tpBegComp - tpBegPars).count(); - double durComp = chrono::duration_cast>(tpBegExec - tpBegComp).count(); - double durExec = chrono::duration_cast>(tpEnd - tpBegExec).count(); - double durTotal = chrono::duration_cast>(tpEnd - tpBeg ).count(); + double durStrt = chrono::duration_cast>(tpBegPars - tpBeg).count(); + double durPars = chrono::duration_cast>(tpBegComp - tpBegPars).count(); + double durComp = chrono::duration_cast>(tpBegExec - tpBegComp).count(); + double durExec = chrono::duration_cast>(tpEnd - tpBegExec).count(); + double durTotal = chrono::duration_cast>(tpEnd - tpBeg).count(); // ToDo: use logger // Output durations in JSON. std::cerr << "{"; - std::cerr << "\"startup_seconds\": " << durStrt << ", "; - std::cerr << "\"parsing_seconds\": " << durPars << ", "; - std::cerr << "\"compilation_seconds\": " << durComp << ", "; - std::cerr << "\"execution_seconds\": " << durExec << ", "; - std::cerr << "\"total_seconds\": " << durTotal; + std::cerr << "\"startup_seconds\": " << durStrt << ", "; + std::cerr << "\"parsing_seconds\": " << durPars << ", "; + std::cerr << "\"compilation_seconds\": " << durComp << ", "; + std::cerr << "\"execution_seconds\": " << durExec << ", "; + std::cerr << "\"total_seconds\": " << durTotal; std::cerr << "}" << std::endl; } if (user_config.statistics) Statistics::instance().dumpStatistics(KernelDispatchMapping::instance()); - // explicitly destroying the moduleOp here due to valgrind complaining about a memory leak otherwise. + // explicitly destroying the moduleOp here due to valgrind complaining about + // a memory leak otherwise. moduleOp->destroy(); return StatusCode::SUCCESS; } - -int mainInternal(int argc, const char** argv, DaphneLibResult* daphneLibRes){ - int id=-1; // this -1 would not change if the user did not select mpi backend during execution +int mainInternal(int argc, const char **argv, DaphneLibResult *daphneLibRes) { + int id = -1; // this -1 would not change if the user did not select mpi + // backend during execution // Initialize user configuration. 
DaphneUserConfig user_config{}; - int res=startDAPHNE(argc, argv, daphneLibRes, &id, user_config); + int res = startDAPHNE(argc, argv, daphneLibRes, &id, user_config); -#ifdef USE_MPI - if(id==COORDINATOR) - { - int size=0; +#ifdef USE_MPI + if (id == COORDINATOR) { + int size = 0; MPI_Comm_size(MPI_COMM_WORLD, &size); - unsigned char terminateMessage=0x00; - for(int i=1;i<size;i++){ - MPI_Send(&terminateMessage,1, MPI_UNSIGNED_CHAR, i, DETACH, MPI_COMM_WORLD); - } - MPI_Finalize(); - } - else if(id>-1){ + unsigned char terminateMessage = 0x00; + for (int i = 1; i < size; i++) { + MPI_Send(&terminateMessage, 1, MPI_UNSIGNED_CHAR, i, DETACH, MPI_COMM_WORLD); + } + MPI_Finalize(); + } else if (id > -1) { MPIWorker worker(user_config); worker.joinComputingTeam(); - res=StatusCode::SUCCESS; + res = StatusCode::SUCCESS; MPI_Finalize(); } #endif - + return res; } diff --git a/src/api/internal/daphne_internal.h b/src/api/internal/daphne_internal.h index 737b0383f..c237ff35d 100644 --- a/src/api/internal/daphne_internal.h +++ b/src/api/internal/daphne_internal.h @@ -18,4 +18,4 @@ #include -int mainInternal(int argc, const char** argv, DaphneLibResult* daphneLibRes); \ No newline at end of file +int mainInternal(int argc, const char **argv, DaphneLibResult *daphneLibRes); \ No newline at end of file diff --git a/src/compiler/catalog/KernelCatalog.h b/src/compiler/catalog/KernelCatalog.h index 258cb8d78..f15a031bc 100644 --- a/src/compiler/catalog/KernelCatalog.h +++ b/src/compiler/catalog/KernelCatalog.h @@ -60,15 +60,9 @@ struct KernelInfo { */ const std::string libPath; - KernelInfo( - const std::string kernelFuncName, - const std::vector resTypes, - const std::vector argTypes, - const std::string backend, - const std::string libPath - ) : - kernelFuncName(kernelFuncName), resTypes(resTypes), argTypes(argTypes), backend(backend), libPath(libPath) - { + KernelInfo(const std::string kernelFuncName, const std::vector resTypes, + const std::vector argTypes, const std::string backend, const std::string libPath) : kernelFuncName(kernelFuncName), resTypes(resTypes), argTypes(argTypes), backend(backend), libPath(libPath) { // } }; @@ -78,44 +72,46 @@ struct KernelInfo { */ class KernelCatalog { /** - * @brief The central data structure mapping DaphneIR operations to registered kernels. - * - * The DaphneIR operation is represented by its mnemonic. The kernels are represented - * by their kernel information. + * @brief The central data structure mapping DaphneIR operations to + * registered kernels. + * + * The DaphneIR operation is represented by its mnemonic. The kernels are + * represented by their kernel information. */ std::unordered_map> kernelInfosByOp; /** * @brief Prints the given kernel information. - * + * * @param opMnemonic The mnemonic of the corresponding DaphneIR operation. * @param kernelInfos The kernel information to print. * @param os The stream to print to. Defaults to `std::cerr`. 
*/ - void dumpKernelInfos(const std::string & opMnemonic, const std::vector & kernelInfos, std::ostream & os = std::cerr) const { + void dumpKernelInfos(const std::string &opMnemonic, const std::vector &kernelInfos, + std::ostream &os = std::cerr) const { os << "- operation `" << opMnemonic << "` (" << kernelInfos.size() << " kernels)" << std::endl; - for(KernelInfo ki : kernelInfos) { + for (KernelInfo ki : kernelInfos) { os << " - kernel `" << ki.kernelFuncName << "`: ("; - for(size_t i = 0; i < ki.argTypes.size(); i++) { + for (size_t i = 0; i < ki.argTypes.size(); i++) { os << ki.argTypes[i]; - if(i < ki.argTypes.size() - 1) + if (i < ki.argTypes.size() - 1) os << ", "; } os << ") -> ("; - for(size_t i = 0; i < ki.resTypes.size(); i++) { + for (size_t i = 0; i < ki.resTypes.size(); i++) { os << ki.resTypes[i]; - if(i < ki.resTypes.size() - 1) + if (i < ki.resTypes.size() - 1) os << ", "; } - os << ") for backend `" << ki.backend << "` (in `" << ki.libPath << "`)" << std::endl; + os << ") for backend `" << ki.backend << "` (in `" << ki.libPath << "`)" << std::endl; } } -public: + public: /** - * @brief Registers the given kernel information as a kernel for the DaphneIR - * operation with the given mnemonic. - * + * @brief Registers the given kernel information as a kernel for the + * DaphneIR operation with the given mnemonic. + * * @param opMnemonic The DaphneIR operation's mnemonic. * @param kernelInfo The information on the kernel. */ @@ -124,15 +120,16 @@ class KernelCatalog { } /** - * @brief Retrieves information on all kernels registered for the given DaphneIR operation. - * + * @brief Retrieves information on all kernels registered for the given + * DaphneIR operation. + * * @param opMnemonic The mnemonic of the DaphneIR operation. - * @return A vector of kernel information, or an empty vector if no kernels are registered - * for the given operation. + * @return A vector of kernel information, or an empty vector if no kernels + * are registered for the given operation. */ - const std::vector getKernelInfos(const std::string & opMnemonic) const { + const std::vector getKernelInfos(const std::string &opMnemonic) const { auto it = kernelInfosByOp.find(opMnemonic); - if(it != kernelInfosByOp.end()) + if (it != kernelInfosByOp.end()) return it->second; else return {}; @@ -145,44 +142,43 @@ class KernelCatalog { * @param kernelFuncName The name of the kernel function to look for. * @return The mnemonic of the operation. */ - std::string getOpMnemonic(const std::string & kernelFuncName) { - for(auto it : kernelInfosByOp) { + std::string getOpMnemonic(const std::string &kernelFuncName) { + for (auto it : kernelInfosByOp) { std::string opMnemonic = it.first; - const std::vector & kis = it.second; - for(auto it2 : kis) - if(it2.kernelFuncName == kernelFuncName) + const std::vector &kis = it.second; + for (auto it2 : kis) + if (it2.kernelFuncName == kernelFuncName) return opMnemonic; } - throw std::runtime_error( - "no kernel with name `" + kernelFuncName + "` registered in the kernel catalog" - ); + throw std::runtime_error("no kernel with name `" + kernelFuncName + "` registered in the kernel catalog"); } /** * @brief Prints high-level statistics on the kernel catalog. - * + * * @param os The stream to print to. Defaults to `std::cerr`. 
*/ - void stats(std::ostream & os = std::cerr) const { + void stats(std::ostream &os = std::cerr) const { const size_t numOps = kernelInfosByOp.size(); size_t numKernels = 0; - for(auto it = kernelInfosByOp.begin(); it != kernelInfosByOp.end(); it++) + for (auto it = kernelInfosByOp.begin(); it != kernelInfosByOp.end(); it++) numKernels += it->second.size(); os << "KernelCatalog (" << numOps << " ops, " << numKernels << " kernels)" << std::endl; } /** * @brief Prints this kernel catalog. - * - * @param opMnemonic If an empty string, print registered kernels for all DaphneIR - * operations; otherwise, consider only the specified DaphneIR operation. + * + * @param opMnemonic If an empty string, print registered kernels for all + * DaphneIR operations; otherwise, consider only the specified DaphneIR + * operation. * @param os The stream to print to. Defaults to `std::cerr`. */ - void dump(std::string opMnemonic = "", std::ostream & os = std::cerr) const { + void dump(std::string opMnemonic = "", std::ostream &os = std::cerr) const { stats(os); - if(opMnemonic.empty()) + if (opMnemonic.empty()) // Print info on all ops. - for(auto it = kernelInfosByOp.begin(); it != kernelInfosByOp.end(); it++) + for (auto it = kernelInfosByOp.begin(); it != kernelInfosByOp.end(); it++) dumpKernelInfos(it->first, it->second, os); else // Print info on specified op only. @@ -190,17 +186,18 @@ class KernelCatalog { } /** - * @brief Returns all distinct kernel libraries in the form of a mapping from - * the library path to the constant `false`. + * @brief Returns all distinct kernel libraries in the form of a mapping + * from the library path to the constant `false`. * - * @return A mapping from each distict kernel library path to the constant `false`. + * @return A mapping from each distinct kernel library path to the constant + * `false`. 
*/ std::unordered_map getLibPaths() const { std::unordered_map res; - for(auto it : kernelInfosByOp) { - const std::vector & kis = it.second; - for(auto it2 : kis) + for (auto it : kernelInfosByOp) { + const std::vector &kis = it.second; + for (auto it2 : kis) res[it2.libPath] = false; } diff --git a/src/compiler/execution/DaphneIrExecutor.cpp b/src/compiler/execution/DaphneIrExecutor.cpp index b19f76ba7..67fdac21d 100644 --- a/src/compiler/execution/DaphneIrExecutor.cpp +++ b/src/compiler/execution/DaphneIrExecutor.cpp @@ -18,14 +18,13 @@ #include #include -#include #include +#include #include #include #include -#include "llvm/Support/TargetSelect.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" @@ -47,13 +46,13 @@ #include "mlir/Support/LogicalResult.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" #include "mlir/Transforms/Passes.h" +#include "llvm/Support/TargetSelect.h" -DaphneIrExecutor::DaphneIrExecutor(bool selectMatrixRepresentations, - DaphneUserConfig cfg) - : userConfig_(std::move(cfg)), - selectMatrixRepresentations_(selectMatrixRepresentations) { +DaphneIrExecutor::DaphneIrExecutor(bool selectMatrixRepresentations, DaphneUserConfig cfg) + : userConfig_(std::move(cfg)), selectMatrixRepresentations_(selectMatrixRepresentations) { // register loggers - if (userConfig_.log_ptr != nullptr) userConfig_.log_ptr->registerLoggers(); + if (userConfig_.log_ptr != nullptr) + userConfig_.log_ptr->registerLoggers(); context_.getOrLoadDialect(); context_.getOrLoadDialect(); @@ -78,49 +77,34 @@ bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) { // return false; //} - if (!module) return false; + if (!module) + return false; // This flag is really useful to figure out why the lowering failed llvm::DebugFlag = userConfig_.debug_llvm; - { - mlir::PassManager pm(&context_); - // TODO Enable the verifier for all passes where it is possible. - // Originally, it was only turned off for the - // SpecializeGenericFunctionsPass. - pm.enableVerifier(false); - if (userConfig_.explain_parsing) - pm.addPass(mlir::daphne::createPrintIRPass("IR after parsing:")); + mlir::PassManager pm(&context_); + // TODO Enable the verifier for all passes where it is possible. + // Originally, it was only turned off for the + // SpecializeGenericFunctionsPass. + pm.enableVerifier(false); - pm.addPass(mlir::createCanonicalizerPass()); - pm.addPass(mlir::createCSEPass()); - if (userConfig_.explain_parsing_simplified) - pm.addPass(mlir::daphne::createPrintIRPass( - "IR after parsing and some simplifications:")); - - pm.addPass(mlir::daphne::createRewriteSqlOpPass()); // calls SQL Parser - if (userConfig_.explain_sql) - pm.addPass( - mlir::daphne::createPrintIRPass("IR after SQL parsing:")); - - pm.addPass( - mlir::daphne::createSpecializeGenericFunctionsPass(userConfig_)); - if (userConfig_.explain_property_inference) - pm.addPass(mlir::daphne::createPrintIRPass("IR after inference:")); - - try { - if (failed(pm.run(module))) { - module->dump(); - module->emitError("module pass error"); - return false; - } - } catch(...) 
{ - ErrorHandler::dumpModuleToDisk(module); - throw; - } - } + if (userConfig_.explain_parsing) + pm.addPass(mlir::daphne::createPrintIRPass("IR after parsing:")); + + pm.addPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::createCSEPass()); + if (userConfig_.explain_parsing_simplified) + pm.addPass(mlir::daphne::createPrintIRPass("IR after parsing and some simplifications:")); + + pm.addPass(mlir::daphne::createRewriteSqlOpPass()); // calls SQL Parser + if (userConfig_.explain_sql) + pm.addPass(mlir::daphne::createPrintIRPass("IR after SQL parsing:")); + + pm.addPass(mlir::daphne::createSpecializeGenericFunctionsPass(userConfig_)); + if (userConfig_.explain_property_inference) + pm.addPass(mlir::daphne::createPrintIRPass("IR after inference:")); - mlir::PassManager pm(&context_); // Note that property inference and canonicalization have already been done // in the SpecializeGenericFunctionsPass, so actually, it's not necessary // here anymore. @@ -137,22 +121,18 @@ bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) { } if (userConfig_.explain_select_matrix_repr) - pm.addPass(mlir::daphne::createPrintIRPass( - "IR after selecting matrix representations:")); + pm.addPass(mlir::daphne::createPrintIRPass("IR after selecting matrix representations:")); if (userConfig_.use_phy_op_selection) { pm.addPass(mlir::daphne::createPhyOperatorSelectionPass()); pm.addPass(mlir::createCSEPass()); } if (userConfig_.explain_phy_op_selection) - pm.addPass(mlir::daphne::createPrintIRPass( - "IR after selecting physical operators:")); + pm.addPass(mlir::daphne::createPrintIRPass("IR after selecting physical operators:")); - pm.addNestedPass( - mlir::daphne::createAdaptTypesToKernelsPass()); + pm.addNestedPass(mlir::daphne::createAdaptTypesToKernelsPass()); if (userConfig_.explain_type_adaptation) - pm.addPass( - mlir::daphne::createPrintIRPass("IR after type adaptation:")); + pm.addPass(mlir::daphne::createPrintIRPass("IR after type adaptation:")); // For now, in order to use the distributed runtime we also require the // vectorized engine to be enabled to create pipelines. 
Therefore, *if* @@ -160,8 +140,7 @@ bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) { if (userConfig_.use_vectorized_exec || userConfig_.use_distributed) { // TODO: add inference here if we have rewrites that could apply to // vectorized pipelines due to smaller sizes - pm.addNestedPass( - mlir::daphne::createVectorizeComputationsPass()); + pm.addNestedPass(mlir::daphne::createVectorizeComputationsPass()); pm.addPass(mlir::createCanonicalizerPass()); } if (userConfig_.explain_vectorized) @@ -170,25 +149,22 @@ bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) { if (userConfig_.use_distributed) pm.addPass(mlir::daphne::createDistributePipelinesPass()); - if (userConfig_.use_mlir_codegen || userConfig_.use_mlir_hybrid_codegen) buildCodegenPipeline(pm); + if (userConfig_.use_mlir_codegen || userConfig_.use_mlir_hybrid_codegen) + buildCodegenPipeline(pm); if (userConfig_.enable_profiling) - pm.addNestedPass( - mlir::daphne::createProfilingPass()); + pm.addNestedPass(mlir::daphne::createProfilingPass()); - pm.addNestedPass( - mlir::daphne::createInsertDaphneContextPass(userConfig_)); + pm.addNestedPass(mlir::daphne::createInsertDaphneContextPass(userConfig_)); #ifdef USE_CUDA if (userConfig_.use_cuda) - pm.addNestedPass( - mlir::daphne::createMarkCUDAOpsPass(userConfig_)); + pm.addNestedPass(mlir::daphne::createMarkCUDAOpsPass(userConfig_)); #endif #ifdef USE_FPGAOPENCL if (userConfig_.use_fpgaopencl) - pm.addNestedPass( - mlir::daphne::createMarkFPGAOPENCLOpsPass(userConfig_)); + pm.addNestedPass(mlir::daphne::createMarkFPGAOPENCLOpsPass(userConfig_)); #endif // Tidy up the IR before managing object reference counters with IncRefOp @@ -200,21 +176,16 @@ bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) { pm.addPass(mlir::createCSEPass()); if (userConfig_.use_obj_ref_mgnt) - pm.addNestedPass( - mlir::daphne::createManageObjRefsPass()); + pm.addNestedPass(mlir::daphne::createManageObjRefsPass()); if (userConfig_.explain_obj_ref_mgnt) - pm.addPass(mlir::daphne::createPrintIRPass( - "IR after managing object references:")); + pm.addPass(mlir::daphne::createPrintIRPass("IR after managing object references:")); - pm.addNestedPass( - mlir::daphne::createRewriteToCallKernelOpPass(userConfig_, usedLibPaths)); + pm.addNestedPass(mlir::daphne::createRewriteToCallKernelOpPass(userConfig_, usedLibPaths)); if (userConfig_.explain_kernels) - pm.addPass( - mlir::daphne::createPrintIRPass("IR after kernel lowering:")); + pm.addPass(mlir::daphne::createPrintIRPass("IR after kernel lowering:")); pm.addPass(mlir::createConvertSCFToCFPass()); - pm.addNestedPass( - mlir::LLVM::createRequestCWrappersPass()); + pm.addNestedPass(mlir::LLVM::createRequestCWrappersPass()); pm.addPass(mlir::daphne::createLowerToLLVMPass(userConfig_)); pm.addPass(mlir::createReconcileUnrealizedCastsPass()); if (userConfig_.explain_llvm) @@ -222,7 +193,7 @@ bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) { // Initialize the use of each distinct kernels library to false. usedLibPaths = userConfig_.kernelCatalog.getLibPaths(); - + try { if (failed(pm.run(module))) { module->dump(); @@ -237,9 +208,9 @@ bool DaphneIrExecutor::runPasses(mlir::ModuleOp module) { return true; } -std::unique_ptr DaphneIrExecutor::createExecutionEngine( - mlir::ModuleOp module) { - if (!module) return nullptr; +std::unique_ptr DaphneIrExecutor::createExecutionEngine(mlir::ModuleOp module) { + if (!module) + return nullptr; // An optimization pipeline to use within the execution engine. 
unsigned optLevel = 0; unsigned sizeLevel = 0; @@ -248,19 +219,17 @@ std::unique_ptr DaphneIrExecutor::createExecutionEngine( // Determine the actually used kernels libraries. std::vector sharedLibRefs; - for(auto it = usedLibPaths.begin(); it != usedLibPaths.end(); it++) - if(it->second) { + for (auto it = usedLibPaths.begin(); it != usedLibPaths.end(); it++) + if (it->second) { std::string usedLibPath = it->first; sharedLibRefPaths.push_back(usedLibPath); sharedLibRefs.emplace_back(sharedLibRefPaths.back()); - // Check if the used kernels library really exists at the expected path - // and throw an understandable error, otherwise. - if(!std::filesystem::exists(usedLibPath)) - throw std::runtime_error( - "the shared library `" + usedLibPath + - "` is needed for some kernel, but the file does not exist" - ); + // Check if the used kernels library really exists at the expected + // path and throw an understandable error, otherwise. + if (!std::filesystem::exists(usedLibPath)) + throw std::runtime_error("the shared library `" + usedLibPath + + "` is needed for some kernel, but the file does not exist"); } registerLLVMDialectTranslation(context_); @@ -276,8 +245,7 @@ std::unique_ptr DaphneIrExecutor::createExecutionEngine( auto maybeEngine = mlir::ExecutionEngine::create(module, options); if (!maybeEngine) { - llvm::errs() << "Failed to create JIT-Execution engine: " - << maybeEngine.takeError(); + llvm::errs() << "Failed to create JIT-Execution engine: " << maybeEngine.takeError(); return nullptr; } return std::move(maybeEngine.get()); @@ -285,8 +253,7 @@ std::unique_ptr DaphneIrExecutor::createExecutionEngine( void DaphneIrExecutor::buildCodegenPipeline(mlir::PassManager &pm) { if (userConfig_.explain_mlir_codegen) - pm.addPass( - mlir::daphne::createPrintIRPass("IR before codegen pipeline")); + pm.addPass(mlir::daphne::createPrintIRPass("IR before codegen pipeline")); pm.addPass(mlir::daphne::createDaphneOptPass()); pm.addPass(mlir::daphne::createEwOpLoweringPass()); @@ -298,28 +265,23 @@ void DaphneIrExecutor::buildCodegenPipeline(mlir::PassManager &pm) { if (!userConfig_.use_mlir_hybrid_codegen) { pm.addPass(mlir::daphne::createMatMulOpLoweringPass( - userConfig_.matmul_tile, userConfig_.matmul_vec_size_bits, - userConfig_.matmul_fixed_tile_sizes, - userConfig_.matmul_use_fixed_tile_sizes, - userConfig_.matmul_unroll_factor, userConfig_.matmul_unroll_jam_factor, - userConfig_.matmul_num_vec_registers, - userConfig_.matmul_invert_loops)); + userConfig_.matmul_tile, userConfig_.matmul_vec_size_bits, userConfig_.matmul_fixed_tile_sizes, + userConfig_.matmul_use_fixed_tile_sizes, userConfig_.matmul_unroll_factor, + userConfig_.matmul_unroll_jam_factor, userConfig_.matmul_num_vec_registers, + userConfig_.matmul_invert_loops)); if (userConfig_.explain_mlir_codegen) - pm.addPass( - mlir::daphne::createPrintIRPass("IR directly after lowering MatMulOp.")); + pm.addPass(mlir::daphne::createPrintIRPass("IR directly after lowering MatMulOp.")); } pm.addPass(mlir::createConvertMathToLLVMPass()); pm.addPass(mlir::daphne::createModOpLoweringPass()); pm.addPass(mlir::createCanonicalizerPass()); pm.addPass(mlir::createCSEPass()); - pm.addNestedPass( - mlir::createAffineScalarReplacementPass()); + pm.addNestedPass(mlir::createAffineScalarReplacementPass()); pm.addPass(mlir::createLowerAffinePass()); mlir::LowerVectorToLLVMOptions lowerVectorToLLVMOptions; pm.addPass(mlir::createConvertVectorToLLVMPass(lowerVectorToLLVMOptions)); - + if (userConfig_.explain_mlir_codegen) - pm.addPass( - 
mlir::daphne::createPrintIRPass("IR after codegen pipeline")); + pm.addPass(mlir::daphne::createPrintIRPass("IR after codegen pipeline")); } diff --git a/src/compiler/execution/DaphneIrExecutor.h b/src/compiler/execution/DaphneIrExecutor.h index b13a23a73..809dafb45 100644 --- a/src/compiler/execution/DaphneIrExecutor.h +++ b/src/compiler/execution/DaphneIrExecutor.h @@ -16,33 +16,27 @@ #pragma once -#include "mlir/IR/BuiltinOps.h" #include "mlir/ExecutionEngine/ExecutionEngine.h" -#include +#include "mlir/IR/BuiltinOps.h" #include "mlir/Pass/PassManager.h" +#include #include -class DaphneIrExecutor -{ -public: +class DaphneIrExecutor { + public: DaphneIrExecutor(bool selectMatrixRepresentations, DaphneUserConfig cfg); bool runPasses(mlir::ModuleOp module); std::unique_ptr createExecutionEngine(mlir::ModuleOp module); - mlir::MLIRContext *getContext() - { return &context_; } + mlir::MLIRContext *getContext() { return &context_; } - DaphneUserConfig & getUserConfig() { - return userConfig_; - } + DaphneUserConfig &getUserConfig() { return userConfig_; } - const DaphneUserConfig & getUserConfig() const { - return userConfig_; - } + const DaphneUserConfig &getUserConfig() const { return userConfig_; } -private: + private: mlir::MLIRContext context_; DaphneUserConfig userConfig_; bool selectMatrixRepresentations_; @@ -50,17 +44,16 @@ class DaphneIrExecutor std::vector sharedLibRefPaths; /** - * @brief A map indicating which of the distinct kernels libraries known to the - * kernel catalog are actually used in the MLIR module. + * @brief A map indicating which of the distinct kernels libraries known to + * the kernel catalog are actually used in the MLIR module. * - * This map gets pre-populated with `false` for each distinct library. The values - * are set to `true` when a call to a pre-compiled kernel from that library is - * created by this pass. This approach is thread-safe, since the structure of the - * map does not change anymore. Thus, it can be used by multiple concurrent - * instances of this pass. + * This map gets pre-populated with `false` for each distinct library. The + * values are set to `true` when a call to a pre-compiled kernel from that + * library is created by the RewriteToCallKernelOpPass pass. This approach + * is thread-safe, since the structure of the map does not change anymore. + * Thus, it can be used by multiple concurrent instances of this pass. */ std::unordered_map usedLibPaths; void buildCodegenPipeline(mlir::PassManager &); }; - diff --git a/src/compiler/explanation/PrintIRPass.cpp b/src/compiler/explanation/PrintIRPass.cpp index 3adf1bf5b..befc773fa 100644 --- a/src/compiler/explanation/PrintIRPass.cpp +++ b/src/compiler/explanation/PrintIRPass.cpp @@ -31,7 +31,7 @@ using namespace mlir; class PrintIRPass : public PassWrapper> { std::string message; - public: + public: PrintIRPass(const std::string message) : message(message) {} void runOnOperation() final; diff --git a/src/compiler/inference/AdaptTypesToKernelsPass.cpp b/src/compiler/inference/AdaptTypesToKernelsPass.cpp index 0e3ec7d4e..414c27827 100644 --- a/src/compiler/inference/AdaptTypesToKernelsPass.cpp +++ b/src/compiler/inference/AdaptTypesToKernelsPass.cpp @@ -26,121 +26,110 @@ using namespace mlir; /** - * @brief Adapts an operation's input/output types such that it can be lowered to an available pre-compiled kernel. - * - * While type inference propagates types through the IR, it is not guaranteed that a pre-compiled kernel - * for each infered type combination is available. 
Thus, the task of this pass is to adapt input and - * output types by casts, where necessary, to ensure that an existing pre-compiled kernel can be used. - * - * At the moment, this pass is implemented in a very simple way. It supports only two concrete actions: - * - It harmonizes the value types of all inputs with those of the single output, for certain operations. - * This is because so far we mainly pre-compile our kernels for homogeneous combinations of input/output - * types. - * - It harmonizes the value types of all inputs (independently of the output type), for certain operations. - * This is because some kernels need to output a different type than their inputs (e.g., comparisons on - * non-numeric value types). - * In general, the affected operations are marked by traits. - * - * In the future, this pass should take the kernel registry and/or extension catalog into account to find - * out for which type combinations there are available kernels. + * @brief Adapts an operation's input/output types such that it can be lowered + * to an available pre-compiled kernel. + * + * While type inference propagates types through the IR, it is not guaranteed + * that a pre-compiled kernel for each inferred type combination is available. + * Thus, the task of this pass is to adapt input and output types by casts, + * where necessary, to ensure that an existing pre-compiled kernel can be used. + * + * At the moment, this pass is implemented in a very simple way. It supports + * only two concrete actions: + * - It harmonizes the value types of all inputs with those of the single + * output, for certain operations. This is because so far we mainly pre-compile + * our kernels for homogeneous combinations of input/output types. + * - It harmonizes the value types of all inputs (independently of the output + * type), for certain operations. This is because some kernels need to output a + * different type than their inputs (e.g., comparisons on non-numeric value + * types). In general, the affected operations are marked by traits. + * + * In the future, this pass should take the kernel registry and/or extension + * catalog into account to find out for which type combinations there are + * available kernels. */ -struct AdaptTypesToKernelsPass : public PassWrapper> -{ +struct AdaptTypesToKernelsPass : public PassWrapper> { void runOnOperation() final; StringRef getArgument() const final { return "adapt-types-to-kernels"; } - StringRef getDescription() const final { - return "TODO"; - } + StringRef getDescription() const final { return "TODO"; } }; -void AdaptTypesToKernelsPass::runOnOperation() -{ +void AdaptTypesToKernelsPass::runOnOperation() { func::FuncOp f = getOperation(); OpBuilder builder(f.getContext()); - f.getBody().front().walk([&](Operation* op) { + f.getBody().front().walk([&](Operation *op) { const size_t numOperands = op->getNumOperands(); - // Depending on the related trait, determine which inputs to cast to which value type. + // Depending on the related trait, determine which inputs to cast to + // which value type. std::vector operandIdxs; // the inputs to cast - Type targetVTy; // the value type to cast to + Type targetVTy; // the value type to cast to - if(op->hasTrait()) { + if (op->hasTrait()) { // The only related trait that does not consider the result type. // TODO Support frame ops. - // Skip frame ops, since we cannot easily cast the column types of frames anyway. 
- if(llvm::any_of(op->getOperands(), [](Value operand){ - return llvm::isa(operand.getType()); - })) + // Skip frame ops, since we cannot easily cast the column types of + // frames anyway. + if (llvm::any_of(op->getOperands(), + [](Value operand) { return llvm::isa(operand.getType()); })) return; // Cast all inputs to the most general input value type. - for(size_t i = 0; i < numOperands; i++) + for (size_t i = 0; i < numOperands; i++) operandIdxs.push_back(i); std::vector argVTys; - for(size_t i = 0; i < numOperands; i++) + for (size_t i = 0; i < numOperands; i++) argVTys.push_back(CompilerUtils::getValueType(op->getOperand(i).getType())); targetVTy = mostGeneralVt(argVTys); - } - else { + } else { // All remaining related traits consider the result type. // Skip operations without results. - if(!op->getNumResults()) + if (!op->getNumResults()) return; Type resTy = op->getResult(0).getType(); // TODO Support frame ops. - // Skip frame ops, since we cannot easily cast the column types of frames anyway. - if( - llvm::isa(resTy) || - llvm::any_of(op->getOperands(), [](Value operand){ + // Skip frame ops, since we cannot easily cast the column types of + // frames anyway. + if (llvm::isa(resTy) || llvm::any_of(op->getOperands(), [](Value operand) { return llvm::isa(operand.getType()); - }) - ) + })) return; Type resVTy = CompilerUtils::getValueType(resTy); - if(op->hasTrait()) { + if (op->hasTrait()) { // Cast all inputs to the result value type. - for(size_t i = 0; i < numOperands; i++) + for (size_t i = 0; i < numOperands; i++) operandIdxs.push_back(i); targetVTy = resVTy; - } - else if(op->hasTrait()) { + } else if (op->hasTrait()) { // Cast inputs 0 and 1 to the result value type. operandIdxs = {0, 1}; targetVTy = resVTy; } - // TODO Instead of such a non-reusable op-specific trait, we should rather check for the concrete op here. - else if(op->hasTrait()) { + // TODO Instead of such a non-reusable op-specific trait, we should + // rather check for the concrete op here. + else if (op->hasTrait()) { // Cast inputs 2 and 3 to the result value type. operandIdxs = {2, 3}; targetVTy = resVTy; } } - if(!operandIdxs.empty()) { + if (!operandIdxs.empty()) { // Insert casts where necessary. builder.setInsertionPoint(op); - for(size_t i : operandIdxs) { + for (size_t i : operandIdxs) { Value argVal = op->getOperand(i); Type argTy = argVal.getType(); - if(CompilerUtils::getValueType(argTy) != targetVTy) { - op->setOperand( - i, - builder.create( - argVal.getLoc(), - CompilerUtils::setValueType(argTy, targetVTy), - argVal - ) - ); + if (CompilerUtils::getValueType(argTy) != targetVTy) { + op->setOperand(i, builder.create( + argVal.getLoc(), CompilerUtils::setValueType(argTy, targetVTy), argVal)); } } } }); } -std::unique_ptr daphne::createAdaptTypesToKernelsPass() -{ - return std::make_unique(); -} +std::unique_ptr daphne::createAdaptTypesToKernelsPass() { return std::make_unique(); } diff --git a/src/compiler/inference/InferencePass.cpp b/src/compiler/inference/InferencePass.cpp index 192890783..2fe63de74 100644 --- a/src/compiler/inference/InferencePass.cpp +++ b/src/compiler/inference/InferencePass.cpp @@ -14,121 +14,105 @@ * limitations under the License. 
*/ -#include #include #include +#include #include #include #include #include -#include #include -#include +#include #include +#include using namespace mlir; -daphne::InferenceConfig::InferenceConfig(bool partialInferenceAllowed, - bool typeInference, - bool shapeInference, - bool frameLabelInference, - bool sparsityInference) +daphne::InferenceConfig::InferenceConfig(bool partialInferenceAllowed, bool typeInference, bool shapeInference, + bool frameLabelInference, bool sparsityInference) : partialInferenceAllowed(partialInferenceAllowed), typeInference(typeInference), shapeInference(shapeInference), frameLabelInference(frameLabelInference), sparsityInference(sparsityInference) {} namespace { - void castOperandIf(OpBuilder & builder, Operation * op, size_t operandIdx, Type type) { - Value operand = op->getOperand(operandIdx); - if(operand.getType() != type) { - builder.setInsertionPoint(op); - op->setOperand( - operandIdx, - // TODO Is this the right loc? - builder.create(op->getLoc(), type, operand) - ); - } +void castOperandIf(OpBuilder &builder, Operation *op, size_t operandIdx, Type type) { + Value operand = op->getOperand(operandIdx); + if (operand.getType() != type) { + builder.setInsertionPoint(op); + op->setOperand(operandIdx, + // TODO Is this the right loc? + builder.create(op->getLoc(), type, operand)); } +} - /** - * @brief Returns a type retaining all common properties of the two - * given types, and setting all mismatching properties to unknown. - * - * If the two given types are of different data types, then `nullptr` - * is returned. - */ - Type getTypeWithCommonInfo(Type t1, Type t2) { - MLIRContext* ctx = t1.getContext(); - Type u = daphne::UnknownType::get(ctx); - auto mat1 = t1.dyn_cast(); - auto mat2 = t2.dyn_cast(); - auto frm1 = t1.dyn_cast(); - auto frm2 = t2.dyn_cast(); - - if(mat1 && mat2) { // both types are matrices - const Type vt1 = mat1.getElementType(); - const Type vt2 = mat2.getElementType(); - const ssize_t nr1 = mat1.getNumRows(); - const ssize_t nr2 = mat2.getNumRows(); - const ssize_t nc1 = mat1.getNumCols(); - const ssize_t nc2 = mat2.getNumCols(); - const ssize_t sp1 = mat1.getSparsity(); - const ssize_t sp2 = mat2.getSparsity(); - const daphne::MatrixRepresentation repr1 = mat1.getRepresentation(); - const daphne::MatrixRepresentation repr2 = mat2.getRepresentation(); - return daphne::MatrixType::get( - ctx, - (vt1 == vt2) ? vt1 : u, - (nr1 == nr2) ? nr1 : -1, - (nc1 == nc2) ? nc1 : -1, - // TODO Maybe do approximate comparison of floating-point values. - (sp1 == sp2) ? sp1 : -1, - (repr1 == repr2) ? repr1 : daphne::MatrixRepresentation::Default - ); - } - else if(frm1 && frm2) { // both types are frames - const std::vector cts1 = frm1.getColumnTypes(); - const std::vector cts2 = frm2.getColumnTypes(); - std::vector cts3; - if(cts1.size() == cts2.size()) - for(size_t i = 0; i < cts1.size(); i++) - cts3.push_back((cts1[i] == cts2[i]) ? cts1[i] : u); - else - // TODO How to represent a frame with unknown column - // types? See #421. - cts3.push_back(u); - const ssize_t nr1 = frm1.getNumRows(); - const ssize_t nr2 = frm2.getNumRows(); - const ssize_t nc1 = frm1.getNumCols(); - const ssize_t nc2 = frm2.getNumCols(); - std::vector* lbls1 = frm1.getLabels(); - std::vector* lbls2 = frm2.getLabels(); - return daphne::FrameType::get( - ctx, - cts3, - (nr1 == nr2) ? nr1 : -1, - (nc1 == nc2) ? nc1 : -1, - // TODO Take #485 into account. - (lbls1 == lbls2) ? 
lbls1 : nullptr - ); - } - else if(mat1 || mat2 || frm1 || frm2) // t1 and t2 are of different data types (matrix, frame, scalar) - return nullptr; - else // both types are unknown or scalars - return (t1 == t2) ? t1 : u; - } +/** + * @brief Returns a type retaining all common properties of the two + * given types, and setting all mismatching properties to unknown. + * + * If the two given types are of different data types, then `nullptr` + * is returned. + */ +Type getTypeWithCommonInfo(Type t1, Type t2) { + MLIRContext *ctx = t1.getContext(); + Type u = daphne::UnknownType::get(ctx); + auto mat1 = t1.dyn_cast(); + auto mat2 = t2.dyn_cast(); + auto frm1 = t1.dyn_cast(); + auto frm2 = t2.dyn_cast(); + + if (mat1 && mat2) { // both types are matrices + const Type vt1 = mat1.getElementType(); + const Type vt2 = mat2.getElementType(); + const ssize_t nr1 = mat1.getNumRows(); + const ssize_t nr2 = mat2.getNumRows(); + const ssize_t nc1 = mat1.getNumCols(); + const ssize_t nc2 = mat2.getNumCols(); + const ssize_t sp1 = mat1.getSparsity(); + const ssize_t sp2 = mat2.getSparsity(); + const daphne::MatrixRepresentation repr1 = mat1.getRepresentation(); + const daphne::MatrixRepresentation repr2 = mat2.getRepresentation(); + return daphne::MatrixType::get(ctx, (vt1 == vt2) ? vt1 : u, (nr1 == nr2) ? nr1 : -1, (nc1 == nc2) ? nc1 : -1, + // TODO Maybe do approximate comparison of floating-point values. + (sp1 == sp2) ? sp1 : -1, + (repr1 == repr2) ? repr1 : daphne::MatrixRepresentation::Default); + } else if (frm1 && frm2) { // both types are frames + const std::vector cts1 = frm1.getColumnTypes(); + const std::vector cts2 = frm2.getColumnTypes(); + std::vector cts3; + if (cts1.size() == cts2.size()) + for (size_t i = 0; i < cts1.size(); i++) + cts3.push_back((cts1[i] == cts2[i]) ? cts1[i] : u); + else + // TODO How to represent a frame with unknown column + // types? See #421. + cts3.push_back(u); + const ssize_t nr1 = frm1.getNumRows(); + const ssize_t nr2 = frm2.getNumRows(); + const ssize_t nc1 = frm1.getNumCols(); + const ssize_t nc2 = frm2.getNumCols(); + std::vector *lbls1 = frm1.getLabels(); + std::vector *lbls2 = frm2.getLabels(); + return daphne::FrameType::get(ctx, cts3, (nr1 == nr2) ? nr1 : -1, (nc1 == nc2) ? nc1 : -1, + // TODO Take #485 into account. + (lbls1 == lbls2) ? lbls1 : nullptr); + } else if (mat1 || mat2 || frm1 || frm2) // t1 and t2 are of different data + // types (matrix, frame, scalar) + return nullptr; + else // both types are unknown or scalars + return (t1 == t2) ? t1 : u; } +} // namespace /** * @brief A compiler pass inferring various properties of the data objects. - * + * * Rooted at a function, the pass walks the operations, and for each operation * it encounters, it infers all currently considered properties of the * operation's results based on the properties of the operation's arguments. * This approach can easily handle dependencies between different properties to * be inferred without explicitly modeling them. - * + * * Note that the actual inference logic is outsourced to MLIR operation * interfaces. */ @@ -136,18 +120,19 @@ class InferencePass : public PassWrapper walkSetUnknown = [&](Operation * op) { - // For all other operations, we reset the types of all results to unknown. - for(size_t i = 0; i < op->getNumResults(); i++) { + std::function walkSetUnknown = [&](Operation *op) { + // For all other operations, we reset the types of all results to + // unknown.
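// Illustrative (derived from how these helpers are used here, not from the
// patch itself): withSameElementType() keeps only the value type and drops
// shape/sparsity/representation, e.g. a result typed as a 5x3 f64 matrix is
// reset to an f64 matrix of unknown shape; withSameColumnTypes() resets a
// frame analogously while keeping its column value types.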
+ for (size_t i = 0; i < op->getNumResults(); i++) { Type t = op->getResult(i).getType(); - if(auto mt = t.dyn_cast()) + if (auto mt = t.dyn_cast()) t = mt.withSameElementType(); - else if(auto ft = t.dyn_cast()) + else if (auto ft = t.dyn_cast()) t = ft.withSameColumnTypes(); op->getResult(i).setType(t); } @@ -157,38 +142,31 @@ class InferencePass : public PassWrapper walkOp = [&](Operation * op) { + std::function walkOp = [&](Operation *op) { const bool isScfOp = op->getDialect() == op->getContext()->getOrLoadDialect(); // ---------------------------------------------------------------- // Handle all non-control-flow (non-SCF) operations // ---------------------------------------------------------------- - if(llvm::isa(op)) { - Type typeWithCommonInfo = getTypeWithCommonInfo( - op->getOperand(1).getType(), - op->getOperand(2).getType() - ); - if(!typeWithCommonInfo) { - throw ErrorHandler::compilerError( - op, "InferencePass.cpp:" + std::to_string(__LINE__), - " a variable must not be assigned values of " - "different data types (matrix, frame, scalar) " - "in then/else branches (arith.select)"); + if (llvm::isa(op)) { + Type typeWithCommonInfo = getTypeWithCommonInfo(op->getOperand(1).getType(), op->getOperand(2).getType()); + if (!typeWithCommonInfo) { + throw ErrorHandler::compilerError(op, "InferencePass.cpp:" + std::to_string(__LINE__), + " a variable must not be assigned values of " + "different data types (matrix, frame, scalar) " + "in then/else branches (arith.select)"); } OpBuilder builder(op->getContext()); castOperandIf(builder, op, 1, typeWithCommonInfo); castOperandIf(builder, op, 2, typeWithCommonInfo); op->getResult(0).setType(typeWithCommonInfo); - } - else if(!isScfOp) { + } else if (!isScfOp) { if (cfg.typeInference && returnsUnknownType(op)) { // Try to infer the types of all results of this operation. try { daphne::setInferedTypes(op, cfg.partialInferenceAllowed); - } - catch (std::runtime_error& re) { - throw ErrorHandler::rethrowError( - "InferencePass.cpp:" + std::to_string(__LINE__), re.what()); + } catch (std::runtime_error &re) { + throw ErrorHandler::rethrowError("InferencePass.cpp:" + std::to_string(__LINE__), re.what()); } } if (cfg.shapeInference && returnsUnknownShape(op)) { @@ -196,18 +174,16 @@ class InferencePass : public PassWrapper> shapes = daphne::tryInferShape(op); const size_t numRes = op->getNumResults(); if (shapes.size() != numRes) { - throw ErrorHandler::compilerError( - op, "InferencePass.cpp:" + std::to_string(__LINE__), - "shape inference for op " + - op->getName().getStringRef().str() + " returned " + - std::to_string(shapes.size()) + - " shapes, but the op has " + - std::to_string(numRes) + " results"); + throw ErrorHandler::compilerError(op, "InferencePass.cpp:" + std::to_string(__LINE__), + "shape inference for op " + op->getName().getStringRef().str() + + " returned " + std::to_string(shapes.size()) + + " shapes, but the op has " + std::to_string(numRes) + + " results"); } // Set the inferred shapes on all results of this operation.
- for(size_t i = 0 ; i < numRes ; i++) { - if(llvm::isa(op->getResultTypes()[i]) || - llvm::isa(op->getResultTypes()[i])) { + for (size_t i = 0; i < numRes; i++) { + if (llvm::isa(op->getResultTypes()[i]) || + llvm::isa(op->getResultTypes()[i])) { const ssize_t numRows = shapes[i].first; const ssize_t numCols = shapes[i].second; Value rv = op->getResult(i); @@ -218,11 +194,11 @@ class InferencePass : public PassWrappergetName().getStringRef().str() + - " operand " + std::to_string(i) + ", since it " - "is neither a matrix nor a frame" - ); + "shape inference cannot set the shape of op " + + op->getName().getStringRef().str() + " operand " + + std::to_string(i) + + ", since it " + "is neither a matrix nor a frame"); } } } @@ -230,18 +206,19 @@ class InferencePass : public PassWrapper sparsities = daphne::tryInferSparsity(op); const size_t numRes = op->getNumResults(); - if(sparsities.size() != numRes) + if (sparsities.size() != numRes) throw ErrorHandler::compilerError(op, "InferencePass", - "sparsity inference for op " + - op->getName().getStringRef().str() + " returned " + - std::to_string(sparsities.size()) + " shapes, but the " - "op has " + std::to_string(numRes) + " results" - ); + "sparsity inference for op " + + op->getName().getStringRef().str() + " returned " + + std::to_string(sparsities.size()) + + " shapes, but the " + "op has " + + std::to_string(numRes) + " results"); // Set the inferred sparsities on all results of this operation. - for(size_t i = 0 ; i < numRes ; i++) { + for (size_t i = 0; i < numRes; i++) { const double sparsity = sparsities[i]; - if(llvm::isa(op->getResultTypes()[i]) || - llvm::isa(op->getResultTypes()[i])) { + if (llvm::isa(op->getResultTypes()[i]) || + llvm::isa(op->getResultTypes()[i])) { Value rv = op->getResult(i); const Type rt = rv.getType(); auto mt = rt.dyn_cast(); @@ -253,16 +230,17 @@ class InferencePass : public PassWrappergetName().getStringRef().str() + - " operand " + std::to_string(i) + ", since it " - "is not a matrix" - ); + "sparsity inference cannot set the shape of " + "op " + + op->getName().getStringRef().str() + " operand " + + std::to_string(i) + + ", since it " + "is not a matrix"); } } } if (cfg.frameLabelInference && returnsFrameWithUnknownLabels(op)) { - if(auto inferFrameLabelsOp = llvm::dyn_cast(op)) + if (auto inferFrameLabelsOp = llvm::dyn_cast(op)) inferFrameLabelsOp.inferFrameLabels(); // Else: Not a problem, since currently we use the frame labels // only to aid type inference, and for this purpose, we don't @@ -274,35 +252,34 @@ class InferencePass : public PassWrapper(op)) { - Block & beforeBlock = whileOp.getBefore().front(); - Block & afterBlock = whileOp.getAfter().front(); + // the then-branch and the value yielded in the else-branch must have + // the same type in MLIR. At the same time, we encode interesting data + // properties (such as those inferred by this pass) as MLIR type + // parameters. As a consequence, e.g., a matrix with two rows and a + // matrix with three rows are technically different MLIR types. Thus, + // e.g., an IfOp cannot simply yield matrices of different shapes from + // the then- and else-branches. To solve this general problem, and to + // allow control-flow operations to change all properties of a data + // object, we generally set mismatching properties to unknown. The + // details depend on the specific SCF operation. 
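// Illustrative scenario (invented for exposition): if a loop body appends one
// row to a matrix per iteration, the yielded value's #rows no longer matches
// the corresponding block argument's #rows. The fix-point search below then
// degrades #rows to unknown on the argument, the yielded value, and the loop
// result, so the IR still type-checks while the unaffected properties
// (#cols, value type, ...) stay precise.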
+ else if (auto whileOp = llvm::dyn_cast(op)) { + Block &beforeBlock = whileOp.getBefore().front(); + Block &afterBlock = whileOp.getAfter().front(); OpBuilder builder(whileOp.getContext()); // Infer the types/properties inside the loop body. If some property - // of some argument is changed inside the loop body, this property is - // set to unknown for both the argument and the yielded value. If that - // is the case, we need to do the inference anew, with the new set of - // arguments' properties. - // This loop searches a fix-point and always terminates, since we only - // set properties to unknown and in the extreme case, after a finite - // number of iterations all of the arguments' properties will have - // become unknown. - while(true) { + // of some argument is changed inside the loop body, this property + // is set to unknown for both the argument and the yielded value. If + // that is the case, we need to do the inference anew, with the new + // set of arguments' properties. This loop searches a fix-point and + // always terminates, since we only set properties to unknown and in + // the extreme case, after a finite number of iterations all of the + // arguments' properties will have become unknown. + while (true) { bool repeat = false; // Transfer the WhileOp's operand types to the block arguments // of the before-block to fulfill constraints on the WhileOp. - for(size_t i = 0; i < whileOp.getNumOperands(); i++) { + for (size_t i = 0; i < whileOp.getNumOperands(); i++) { Type t = whileOp->getOperand(i).getType(); beforeBlock.getArgument(i).setType(t); } @@ -312,17 +289,17 @@ class InferencePass : public PassWrapper(walkOp); // Get the ConditionOp. - Operation * condOp = beforeBlock.getTerminator(); + Operation *condOp = beforeBlock.getTerminator(); - if(!llvm::isa(condOp)) + if (!llvm::isa(condOp)) throw ErrorHandler::compilerError(op, "InferencePass", "WhileOp terminator is not a ConditionOp"); - // Transfer the ConditionOp's operand types to the block arguments - // of the after-block and the results of the WhileOp to fulfill - // constraints on the WhileOp. - // Note that the first operand of the ConditionOp is skipped, since it - // is the condition value itself. - for(size_t i = 1; i < condOp->getNumOperands(); i++) { + // Transfer the ConditionOp's operand types to the block + // arguments of the after-block and the results of the WhileOp + // to fulfill constraints on the WhileOp. Note that the first + // operand of the ConditionOp is skipped, since it is the + // condition value itself. + for (size_t i = 1; i < condOp->getNumOperands(); i++) { Type t = condOp->getOperand(i).getType(); afterBlock.getArgument(i - 1).setType(t); whileOp.getResult(i - 1).setType(t); @@ -333,72 +310,71 @@ class InferencePass : public PassWrapper(walkOp); // Get the YieldOp. - Operation * yieldOp = afterBlock.getTerminator(); + Operation *yieldOp = afterBlock.getTerminator(); - if(whileOp->getNumOperands() != yieldOp->getNumOperands()) - throw ErrorHandler::compilerError( - op, "InferencePass", - "WhileOp and YieldOp must have the same number of " - "operands"); + if (whileOp->getNumOperands() != yieldOp->getNumOperands()) + throw ErrorHandler::compilerError(op, "InferencePass", + "WhileOp and YieldOp must have the same number of " + "operands"); // Check if the inferred MLIR types match the result MLIR types. - // If any interesting properties were changed inside the loop body, - // we set them to unknown to make the type comparison pass. 
- for(size_t i = 0; i < whileOp.getNumOperands(); i++) { + // If any interesting properties were changed inside the loop + // body, we set them to unknown to make the type comparison + // pass. + for (size_t i = 0; i < whileOp.getNumOperands(); i++) { Type yieldedTy = yieldOp->getOperand(i).getType(); Type operandTy = op->getOperand(i).getType(); - if(yieldedTy != operandTy) { - // Get a type with the conflicting properties set to unknown. + if (yieldedTy != operandTy) { + // Get a type with the conflicting properties set to + // unknown. Type typeWithCommonInfo = getTypeWithCommonInfo(yieldedTy, operandTy); - if(!typeWithCommonInfo) { - throw ErrorHandler::compilerError( - op, "InferencePass", - "the data type (matrix, frame, scalar) of a " - "variable " - "must not be changed within the body of a " - "while-loop"); + if (!typeWithCommonInfo) { + throw ErrorHandler::compilerError(op, "InferencePass", + "the data type (matrix, frame, scalar) of a " + "variable " + "must not be changed within the body of a " + "while-loop"); } // Use casts to remove those properties accordingly. castOperandIf(builder, yieldOp, i, typeWithCommonInfo); castOperandIf(builder, whileOp, i, typeWithCommonInfo); - // Since the WhileOp's argument types/properties have changed, - // we must repeat the inference for the loop body. + // Since the WhileOp's argument types/properties have + // changed, we must repeat the inference for the loop + // body. repeat = true; } } - if(repeat) { - // Before we can repeat the inference, we reset all information - // inferred so far to unknown (in the loop body). + if (repeat) { + // Before we can repeat the inference, we reset all + // information inferred so far to unknown (in the loop + // body). beforeBlock.walk(walkSetUnknown); afterBlock.walk(walkSetUnknown); - } - else + } else // If all types matched, we are done. break; } // Tell the walker to skip the descendants of the WhileOp, we // have already triggered a walk on them explicitly. return WalkResult::skip(); - } - else if(auto forOp = llvm::dyn_cast(op)) { - Block & block = forOp.getRegion().front(); + } else if (auto forOp = llvm::dyn_cast(op)) { + Block &block = forOp.getRegion().front(); const size_t numIndVars = forOp.getNumInductionVars(); OpBuilder builder(forOp.getContext()); // Infer the types/properties inside the loop body. If some property - // of some argument is changed inside the loop body, this property is - // set to unknown for both the argument and the yielded value. If that - // is the case, we need to do the inference anew, with the new set of - // arguments' properties. - // This loop searches a fix-point and always terminates, since we only - // set properties to unknown and in the extreme case, after a finite - // number of iterations all of the arguments' properties will have - // become unknown. - while(true) { + // of some argument is changed inside the loop body, this property + // is set to unknown for both the argument and the yielded value. If + // that is the case, we need to do the inference anew, with the new + // set of arguments' properties. This loop searches a fix-point and + // always terminates, since we only set properties to unknown and in + // the extreme case, after a finite number of iterations all of the + // arguments' properties will have become unknown. + while (true) { bool repeat = false; // Transfer the ForOp's operand types to the block arguments // and results to fulfill constraints on the ForOp. 
- for(size_t i = 0; i < forOp.getNumIterOperands(); i++) { + for (size_t i = 0; i < forOp.getNumIterOperands(); i++) { Type t = forOp.getIterOpOperands()[i].get().getType(); block.getArgument(i + numIndVars).setType(t); forOp.getResult(i).setType(t); @@ -409,35 +385,38 @@ class InferencePass : public PassWrapper(walkOp); // Get the YieldOp. - Operation * yieldOp = block.getTerminator(); + Operation *yieldOp = block.getTerminator(); // Check if the inferred MLIR types match the result MLIR types. - // If any interesting properties were changed inside the loop body, - // we set them to unknown to make the type comparison pass. - for(size_t i = 0; i < forOp.getNumIterOperands(); i++) { + // If any interesting properties were changed inside the loop + // body, we set them to unknown to make the type comparison + // pass. + for (size_t i = 0; i < forOp.getNumIterOperands(); i++) { Type yieldedTy = yieldOp->getOperand(i).getType(); Type resultTy = op->getResult(i).getType(); - if(yieldedTy != resultTy) { - // Get a type with the conflicting properties set to unknown. + if (yieldedTy != resultTy) { + // Get a type with the conflicting properties set to + // unknown. Type typeWithCommonInfo = getTypeWithCommonInfo(yieldedTy, resultTy); - if(!typeWithCommonInfo) - throw ErrorHandler::compilerError( - op, "InferencePass.cpp:" + std::to_string(__LINE__), - "the data type (matrix, frame, scalar) of a " - "variable " - "must not be changed within the body of a " - "for-loop."); + if (!typeWithCommonInfo) + throw ErrorHandler::compilerError(op, "InferencePass.cpp:" + std::to_string(__LINE__), + "the data type (matrix, frame, scalar) of a " + "variable " + "must not be changed within the body of a " + "for-loop."); // Use casts to remove those properties accordingly. castOperandIf(builder, yieldOp, i, typeWithCommonInfo); castOperandIf(builder, forOp, forOp.getNumControlOperands() + i, typeWithCommonInfo); - // Since the WhileOp's argument types/properties have changed, - // we must repeat the inference for the loop body. + // Since the ForOp's argument types/properties have + // changed, we must repeat the inference for the loop + // body. repeat = true; } } - if(repeat) - // Before we can repeat the inference, we reset all information - // inferred so far to unknown (in the loop body). + if (repeat) + // Before we can repeat the inference, we reset all + // information inferred so far to unknown (in the loop + // body). block.walk(walkSetUnknown); else // If all types matched, we are done. @@ -446,12 +425,11 @@ class InferencePass : public PassWrapper(op)) { + } else if (auto ifOp = llvm::dyn_cast(op)) { // Walk the then/else blocks first. We need the inference on // them before we can do anything about the IfOp itself.
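// Illustrative example (shapes invented for exposition): if the then-branch
// yields a 2x2 f64 matrix and the else-branch a 3x2 f64 matrix,
// getTypeWithCommonInfo produces an f64 matrix with unknown #rows and
// #cols = 2; casts are inserted on both yields, and the IfOp result takes
// this common type.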
ifOp.thenBlock()->walk(walkOp); - if(ifOp.elseBlock()) { + if (ifOp.elseBlock()) { ifOp.elseBlock()->walk(walkOp); // For all pairs of corresponding values yielded in the @@ -461,17 +439,14 @@ class InferencePass : public PassWrappergetOperand(i).getType(), - elseYield->getOperand(i).getType() - ); - if(!typeWithCommonInfo) - throw ErrorHandler::compilerError( - op, "InferencePass" + std::to_string(__LINE__), - "a variable must not be assigned values of " - "different data types (matrix, frame, scalar) " - "in then/else branches"); + for (size_t i = 0; i < ifOp.getNumResults(); i++) { + Type typeWithCommonInfo = + getTypeWithCommonInfo(thenYield->getOperand(i).getType(), elseYield->getOperand(i).getType()); + if (!typeWithCommonInfo) + throw ErrorHandler::compilerError(op, "InferencePass" + std::to_string(__LINE__), + "a variable must not be assigned values of " + "different data types (matrix, frame, scalar) " + "in then/else branches"); castOperandIf(builder, thenYield, i, typeWithCommonInfo); castOperandIf(builder, elseYield, i, typeWithCommonInfo); ifOp.getResult(i).setType(typeWithCommonInfo); @@ -486,7 +461,7 @@ class InferencePass : public PassWrapper(walkOp); } catch (std::runtime_error &re) { - throw ErrorHandler::rethrowError( - "InferencePass.cpp:" + std::to_string(__LINE__), re.what()); + throw ErrorHandler::rethrowError("InferencePass.cpp:" + std::to_string(__LINE__), re.what()); } // infer function return types - f.setType(FunctionType::get(&getContext(), - f.getFunctionType().getInputs(), - f.getBody().back().getTerminator()->getOperandTypes())); + f.setType(FunctionType::get(&getContext(), f.getFunctionType().getInputs(), + f.getBody().back().getTerminator()->getOperandTypes())); } static bool returnsUnknownType(Operation *op) { return llvm::any_of(op->getResultTypes(), [](Type resType) { - if(llvm::isa(resType)) + if (llvm::isa(resType)) return true; - if(auto mt = resType.dyn_cast()) + if (auto mt = resType.dyn_cast()) return llvm::isa(mt.getElementType()); - if(auto ft = resType.dyn_cast()) - for(Type ct : ft.getColumnTypes()) - if(llvm::isa(ct)) + if (auto ft = resType.dyn_cast()) + for (Type ct : ft.getColumnTypes()) + if (llvm::isa(ct)) return true; return false; }); } - static bool returnsFrameWithUnknownLabels(Operation * op) { + static bool returnsFrameWithUnknownLabels(Operation *op) { return llvm::any_of(op->getResultTypes(), [](Type resultType) { auto ft = resultType.dyn_cast(); return ft && !ft.getLabels(); }); } - static bool returnsUnknownShape(Operation * op) { + static bool returnsUnknownShape(Operation *op) { return llvm::any_of(op->getResultTypes(), [](Type rt) { - if(auto mt = rt.dyn_cast()) + if (auto mt = rt.dyn_cast()) return mt.getNumRows() == -1 || mt.getNumCols() == -1; - if(auto ft = rt.dyn_cast()) + if (auto ft = rt.dyn_cast()) return ft.getNumRows() == -1 || ft.getNumCols() == -1; return false; }); } - static bool returnsUnknownSparsity(Operation * op) { + static bool returnsUnknownSparsity(Operation *op) { return llvm::any_of(op->getResultTypes(), [](Type rt) { - if(auto mt = rt.dyn_cast()) + if (auto mt = rt.dyn_cast()) return mt.getSparsity() == -1.0; return false; }); diff --git a/src/compiler/inference/SelectMatrixRepresentationsPass.cpp b/src/compiler/inference/SelectMatrixRepresentationsPass.cpp index b05c996d9..f49fa05f1 100644 --- a/src/compiler/inference/SelectMatrixRepresentationsPass.cpp +++ b/src/compiler/inference/SelectMatrixRepresentationsPass.cpp @@ -22,26 +22,27 @@ #include #include -#include #include +#include using namespace 
mlir; -class SelectMatrixRepresentationsPass : public PassWrapper> { - const DaphneUserConfig& cfg; +class SelectMatrixRepresentationsPass + : public PassWrapper> { + const DaphneUserConfig &cfg; - std::function walkOp = [&](Operation * op) { - if(returnsKnownProperties(op)) { + std::function walkOp = [&](Operation *op) { + if (returnsKnownProperties(op)) { const bool isScfOp = op->getDialect() == op->getContext()->getOrLoadDialect(); // ---------------------------------------------------------------- // Handle all non-SCF operations // ---------------------------------------------------------------- - if(!isScfOp) { + if (!isScfOp) { // Set the matrix representation for all result types - for(auto res : op->getResults()) { - if(auto matTy = res.getType().dyn_cast()) { + for (auto res : op->getResults()) { + if (auto matTy = res.getType().dyn_cast()) { const double sparsity = matTy.getSparsity(); - if(sparsity < cfg.sparsity_threshold) { + if (sparsity < cfg.sparsity_threshold) { res.setType(matTy.withRepresentation(daphne::MatrixRepresentation::Sparse)); } } @@ -52,12 +53,12 @@ class SelectMatrixRepresentationsPass : public PassWrapper(op)) { + else if (auto whileOp = llvm::dyn_cast(op)) { Block &beforeBlock = whileOp.getBefore().front(); Block &afterBlock = whileOp.getAfter().front(); // Transfer the WhileOp's operand types to the block arguments // and results to fulfill constraints on the WhileOp. - for(size_t i = 0 ; i < whileOp.getNumOperands() ; i++) { + for (size_t i = 0; i < whileOp.getNumOperands(); i++) { Type t = whileOp->getOperand(i).getType(); beforeBlock.getArgument(i).setType(t); afterBlock.getArgument(i).setType(t); @@ -68,31 +69,30 @@ class SelectMatrixRepresentationsPass : public PassWrapper(walkOp); afterBlock.walk(walkOp); - // Check if the inferred matrix representations match the required result representations. - // This is not the case if, for instance, the representation of some - // variable written in the loop changes. The WhileOp would also - // check this later during verification, but here, we want to - // throw a readable error message. + // Check if the inferred matrix representations match the + // required result representations. This is not the case if, for + // instance, the representation of some variable written in the + // loop changes. The WhileOp would also check this later during + // verification, but here, we want to throw a readable error + // message. Operation *yieldOp = afterBlock.getTerminator(); - for(size_t i = 0 ; i < whileOp.getNumOperands() ; i++) { + for (size_t i = 0; i < whileOp.getNumOperands(); i++) { Type yieldedTy = yieldOp->getOperand(i).getType(); Type resultTy = op->getResult(i).getType(); if (yieldedTy != resultTy) - throw ErrorHandler::compilerError( - whileOp, "SelectMatrixRepresentationsPass", - "the representation of a matrix must not be " - "changed within the body of a while-loop."); + throw ErrorHandler::compilerError(whileOp, "SelectMatrixRepresentationsPass", + "the representation of a matrix must not be " + "changed within the body of a while-loop."); } // Tell the walker to skip the descendants of the WhileOp, we // have already triggered a walk on them explicitly. return WalkResult::skip(); - } - else if(auto forOp = llvm::dyn_cast(op)) { + } else if (auto forOp = llvm::dyn_cast(op)) { Block &block = forOp.getRegion().front(); const size_t numIndVars = forOp.getNumInductionVars(); // Transfer the ForOp's operand types to the block arguments // and results to fulfill constraints on the ForOp. 
- for(size_t i = 0 ; i < forOp.getNumIterOperands() ; i++) { + for (size_t i = 0; i < forOp.getNumIterOperands(); i++) { Type t = forOp.getIterOperands()[i].getType(); block.getArgument(i + numIndVars).setType(t); forOp.getResult(i).setType(t); @@ -100,45 +100,43 @@ class SelectMatrixRepresentationsPass : public PassWrapper(walkOp); - // Check if the infered matrix representations match the required result representations. - // This is not the case if, for instance, the representation of some - // variable written in the loop changes. The ForOp would also - // check this later during verification, but here, we want to - // throw a readable error message. + // Check if the inferred matrix representations match the + // required result representations. This is not the case if, for + // instance, the representation of some variable written in the + // loop changes. The ForOp would also check this later during + // verification, but here, we want to throw a readable error + // message. Operation *yieldOp = block.getTerminator(); - for(size_t i = 0 ; i < forOp.getNumIterOperands() ; i++) { + for (size_t i = 0; i < forOp.getNumIterOperands(); i++) { Type yieldedTy = yieldOp->getOperand(i).getType(); Type resultTy = op->getResult(i).getType(); if (yieldedTy != resultTy) - throw ErrorHandler::compilerError( - forOp, "SelectMatrixRepresentationsPass", - "the representation of a matrix must not be " - "changed within the body of a for-loop"); + throw ErrorHandler::compilerError(forOp, "SelectMatrixRepresentationsPass", + "the representation of a matrix must not be " + "changed within the body of a for-loop"); } // Tell the walker to skip the descendants of the ForOp, we // have already triggered a walk on them explicitly. return WalkResult::skip(); - } - else if(auto ifOp = llvm::dyn_cast(op)) { + } else if (auto ifOp = llvm::dyn_cast(op)) { // Walk the then/else blocks first. We need the inference on // them before we can do anything about the IfOp itself. ifOp.thenBlock()->walk(walkOp); ifOp.elseBlock()->walk(walkOp); - // Check if the yielded matrix representations are the same in both - // branches. The IfOp would also check this later during + // Check if the yielded matrix representations are the same in + // both branches. The IfOp would also check this later during // verification, but here, we want to throw a readable error // message. // Additionally, we set the result types of the IfOp here. scf::YieldOp thenYield = ifOp.thenYield(); scf::YieldOp elseYield = ifOp.elseYield(); - for(size_t i = 0 ; i < ifOp.getNumResults() ; i++) { + for (size_t i = 0; i < ifOp.getNumResults(); i++) { Type thenTy = thenYield->getOperand(i).getType(); Type elseTy = elseYield->getOperand(i).getType(); if (thenTy != elseTy) - throw ErrorHandler::compilerError( - ifOp, "SelectMatrixRepresentationsPass", - "a matrix must not be assigned two values of " - "different representations in then/else branches"); + throw ErrorHandler::compilerError(ifOp, "SelectMatrixRepresentationsPass", + "a matrix must not be assigned two values of " + "different representations in then/else branches"); ifOp.getResult(i).setType(thenTy); } // Tell the walker to skip the descendants of the IfOp, we @@ -150,17 +148,16 @@ class SelectMatrixRepresentationsPass : public PassWrapper(walkOp); // infer function return types // TODO: cast for UDFs?
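// Illustrative: the function type is simply rebuilt from the operand types
// of the body's final terminator, e.g. a body whose return value was just
// selected to be a sparse f64 matrix turns a more generic declared result
// type into that concrete matrix type (see f.setType(...) below).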
- f.setType(FunctionType::get(&getContext(), - f.getFunctionType().getInputs(), - f.getBody().back().getTerminator()->getOperandTypes())); + f.setType(FunctionType::get(&getContext(), f.getFunctionType().getInputs(), + f.getBody().back().getTerminator()->getOperandTypes())); } StringRef getArgument() const final { return "select-matrix-representations"; } @@ -168,13 +165,13 @@ class SelectMatrixRepresentationsPass : public PassWrappergetResultTypes(), [](Type rt) { - if(auto mt = rt.dyn_cast()) + if (auto mt = rt.dyn_cast()) return mt.getSparsity() != -1.0; return false; }); } }; -std::unique_ptr daphne::createSelectMatrixRepresentationsPass(const DaphneUserConfig& cfg) { +std::unique_ptr daphne::createSelectMatrixRepresentationsPass(const DaphneUserConfig &cfg) { return std::make_unique(cfg); } diff --git a/src/compiler/inference/TypeInferenceUtils.cpp b/src/compiler/inference/TypeInferenceUtils.cpp index e9a3a5365..aba2d3611 100644 --- a/src/compiler/inference/TypeInferenceUtils.cpp +++ b/src/compiler/inference/TypeInferenceUtils.cpp @@ -18,67 +18,73 @@ int generality(mlir::Type t) { using namespace mlir; - + // TODO It is debatable if unsigned int shall be more general than signed // int of the same bit width. - + // The greater the number, the more general the type. - if(llvm::isa(t)) return 11; - if(llvm::isa(t)) return 10; - if(t.isF64()) return 9; - if(t.isF32()) return 8; - if(t.isUnsignedInteger(64)) return 7; - if(t. isSignedInteger(64)) return 6; - if(t.isIndex()) return 5; - if(t.isUnsignedInteger(32)) return 4; - if(t. isSignedInteger(32)) return 3; - if(t.isUnsignedInteger(8)) return 2; - if(t. isSignedInteger(8)) return 1; - if(t. isInteger(1)) return 0; - + if (llvm::isa(t)) + return 11; + if (llvm::isa(t)) + return 10; + if (t.isF64()) + return 9; + if (t.isF32()) + return 8; + if (t.isUnsignedInteger(64)) + return 7; + if (t.isSignedInteger(64)) + return 6; + if (t.isIndex()) + return 5; + if (t.isUnsignedInteger(32)) + return 4; + if (t.isSignedInteger(32)) + return 3; + if (t.isUnsignedInteger(8)) + return 2; + if (t.isSignedInteger(8)) + return 1; + if (t.isInteger(1)) + return 0; + std::string str; llvm::raw_string_ostream msg(str); msg << "no generality code available for value type: " << t; throw std::runtime_error(msg.str()); } -mlir::Type mostGeneralVt(const std::vector & vt) { - if(vt.empty()) - throw std::runtime_error( - "mostGeneralVt() invoked with empty list of value types" - ); - +mlir::Type mostGeneralVt(const std::vector &vt) { + if (vt.empty()) + throw std::runtime_error("mostGeneralVt() invoked with empty list of value types"); + mlir::Type res = vt[0]; - for(size_t i = 1; i < vt.size(); i++) - if(generality(vt[i]) > generality(res)) + for (size_t i = 1; i < vt.size(); i++) + if (generality(vt[i]) > generality(res)) res = vt[i]; - + return res; } -mlir::Type mostGeneralVt(const std::vector> & vts, size_t num) { - if(vts.empty()) - throw std::runtime_error( - "mostGeneralVt() invoked with empty list of lists of value types" - ); - - if(num == 0) +mlir::Type mostGeneralVt(const std::vector> &vts, size_t num) { + if (vts.empty()) + throw std::runtime_error("mostGeneralVt() invoked with empty list of lists of value types"); + + if (num == 0) num = vts.size(); - + mlir::Type res = mostGeneralVt(vts[0]); - for(size_t i = 1; i < std::min(vts.size(), num); i++) { + for (size_t i = 1; i < std::min(vts.size(), num); i++) { mlir::Type cur = mostGeneralVt(vts[i]); - if(generality(cur) > generality(res)) + if (generality(cur) > generality(res)) res = cur; } - + 
return res; } -std::vector inferValueTypeFromArgs( - const std::vector & argDtc, - std::vector> & argVts -) { +std::vector inferValueTypeFromArgs(const std::vector &argDtc, + std::vector> &argVts) { // TODO Simplify: resDtc is already known. If it's not Frame, this // can be done simpler and we don't need the getMostGeneralVt later. @@ -86,31 +92,29 @@ std::vector inferValueTypeFromArgs( // arguments to match the number of column types of frame arguments. size_t commonNumFrameCols = 1; bool hasFrame = false; - for(size_t i = 0; i < argVts.size(); i++) - if(argDtc[i] == DataTypeCode::FRAME) { - if(hasFrame && argVts[i].size() != commonNumFrameCols) - throw std::runtime_error( - "type inference trait ValueTypeFromArgs requires that " - "all input frames have the same number of columns" - ); + for (size_t i = 0; i < argVts.size(); i++) + if (argDtc[i] == DataTypeCode::FRAME) { + if (hasFrame && argVts[i].size() != commonNumFrameCols) + throw std::runtime_error("type inference trait ValueTypeFromArgs requires that " + "all input frames have the same number of columns"); hasFrame = true; commonNumFrameCols = argVts[i].size(); } // If required: Expand the value type of matrix and scalar arguments to // match the common number of column types of frame arguments. - if(hasFrame) - for(size_t i = 0; i < argVts.size(); i++) - if(argDtc[i] != DataTypeCode::FRAME) + if (hasFrame) + for (size_t i = 0; i < argVts.size(); i++) + if (argDtc[i] != DataTypeCode::FRAME) argVts[i] = std::vector(commonNumFrameCols, argVts[i][0]); // Determine the most general argument value type. This is done for each // column separately, if frames are involved. std::vector resVts = argVts[0]; - for(size_t i = 1; i < argVts.size(); i++) - for(size_t k = 0; k < commonNumFrameCols; k++) - if(generality(argVts[i][k]) > generality(resVts[k])) + for (size_t i = 1; i < argVts.size(); i++) + for (size_t k = 0; k < commonNumFrameCols; k++) + if (generality(argVts[i][k]) > generality(resVts[k])) resVts[k] = argVts[i][k]; - + return resVts; } diff --git a/src/compiler/inference/TypeInferenceUtils.h b/src/compiler/inference/TypeInferenceUtils.h index e03f64373..5de58c8e2 100644 --- a/src/compiler/inference/TypeInferenceUtils.h +++ b/src/compiler/inference/TypeInferenceUtils.h @@ -18,20 +18,20 @@ #include -#include #include +#include #include #include /** * @brief Returns an integer code representing how general a value type is. - * + * * This code can be used to determine which of two value types is more general. * The larger the code, the more general the value type. - * + * * @param t - * @return + * @return */ int generality(mlir::Type t); @@ -51,54 +51,48 @@ enum class DataTypeCode : uint8_t { /** * @brief Returns the most general value type in a list of value types. - * + * * @param vt A list of value types. - * @return + * @return */ -mlir::Type mostGeneralVt(const std::vector & vt); +mlir::Type mostGeneralVt(const std::vector &vt); /** * @brief Returns the most general value type in a list of lists of value types. - * + * * @param vts A list of lists of value types. * @param num Optionally, only consider the first `num` lists of value types. - * @return + * @return */ -mlir::Type mostGeneralVt( - const std::vector> & vts, - size_t num = 0 -); +mlir::Type mostGeneralVt(const std::vector> &vts, size_t num = 0); /** * @brief Infers the value type assuming the type inference trait * `ValueTypeFromArgs`. - * + * * @param argDtc Information on the argument data types. * @param argVts Information on the argument value types. 
* @return The inferred value type. */ -std::vector inferValueTypeFromArgs( - const std::vector & argDtc, - std::vector> & argVts -); +std::vector inferValueTypeFromArgs(const std::vector &argDtc, + std::vector> &argVts); /** * @brief Infers the type of the result of the given operation based on its * type inference traits. - * + * * @tparam O The type of the operation. For the inference in the compiler we * use `mlir::Operation`, but for the unit tests we use a mock class. * @param op * @return The inferred type of the single result of the operation. */ -template -mlir::Type inferTypeByTraits(O * op) { +template mlir::Type inferTypeByTraits(O *op) { using namespace mlir; using namespace mlir::OpTrait; - - MLIRContext * ctx = op->getContext(); + + MLIRContext *ctx = op->getContext(); Type u = daphne::UnknownType::get(ctx); - + Type resTy = u; // -------------------------------------------------------------------- @@ -107,20 +101,17 @@ mlir::Type inferTypeByTraits(O * op) { std::vector argDtc; std::vector> argVts; - for(Type t : op->getOperandTypes()) { - if(llvm::isa(t)) { + for (Type t : op->getOperandTypes()) { + if (llvm::isa(t)) { argDtc.push_back(DataTypeCode::UNKNOWN); argVts.push_back({u}); - } - else if(auto ft = t.dyn_cast()) { + } else if (auto ft = t.dyn_cast()) { argDtc.push_back(DataTypeCode::FRAME); argVts.push_back(ft.getColumnTypes()); - } - else if(auto mt = t.dyn_cast()) { + } else if (auto mt = t.dyn_cast()) { argDtc.push_back(DataTypeCode::MATRIX); argVts.push_back({mt.getElementType()}); - } - else { // TODO Check if this is really a supported scalar type! + } else { // TODO Check if this is really a supported scalar type! argDtc.push_back(DataTypeCode::SCALAR); argVts.push_back({t}); } @@ -132,19 +123,18 @@ mlir::Type inferTypeByTraits(O * op) { DataTypeCode resDtc = DataTypeCode::UNKNOWN; - if(op->template hasTrait() || op->template hasTrait()) + if (op->template hasTrait() || op->template hasTrait()) resDtc = argDtc[0]; - else if(op->template hasTrait()) { + else if (op->template hasTrait()) { resDtc = argDtc[0]; - for(size_t i = 1; i < argDtc.size(); i++) - if(argDtc[i] > resDtc) // generality comparison + for (size_t i = 1; i < argDtc.size(); i++) + if (argDtc[i] > resDtc) // generality comparison resDtc = argDtc[i]; - } - else if(op->template hasTrait()) + } else if (op->template hasTrait()) resDtc = DataTypeCode::SCALAR; - else if(op->template hasTrait()) + else if (op->template hasTrait()) resDtc = DataTypeCode::MATRIX; - else if(op->template hasTrait()) + else if (op->template hasTrait()) resDtc = DataTypeCode::FRAME; // -------------------------------------------------------------------- @@ -154,158 +144,140 @@ mlir::Type inferTypeByTraits(O * op) { // TODO What about the #cols, if the data type is a frame (see #421)? std::vector resVts = {u}; - if(op->template hasTrait()) { + if (op->template hasTrait()) { // Initially take the most general value type of the arguments, // resVts has one element for scalars and matrices, or one element // per column for frames. resVts = inferValueTypeFromArgs(argDtc, argVts); // Replace string by si64. Otherwise, we would represent the result // of the comparison of two strings as a string.
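// Illustrative example (scenario invented for exposition): for an
// elementwise equality comparison of two string matrices,
// inferValueTypeFromArgs would yield the string type, so the loop below
// replaces it by si64 and the result is inferred as a 0/1 si64 matrix
// rather than a string matrix.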
- for(size_t i = 0; i < resVts.size(); i++) - if(llvm::isa(resVts[i])) + for (size_t i = 0; i < resVts.size(); i++) + if (llvm::isa(resVts[i])) resVts[i] = IntegerType::get(ctx, 64, IntegerType::SignednessSemantics::Signed); - } - else if(op->template hasTrait()) + } else if (op->template hasTrait()) resVts = argVts[0]; - else if(op->template hasTrait()) { - if(resDtc == DataTypeCode::FRAME && argDtc[0] == DataTypeCode::MATRIX) { + else if (op->template hasTrait()) { + if (resDtc == DataTypeCode::FRAME && argDtc[0] == DataTypeCode::MATRIX) { // We need to make sure that the value type of the input matrix is // repeated in the column value types of the output frame to match // the number of columns of the input matrix. - const ssize_t numCols = op->getOperand(0) - .getType() - .template dyn_cast() - .getNumCols(); - if(numCols == -1) + const ssize_t numCols = op->getOperand(0).getType().template dyn_cast().getNumCols(); + if (numCols == -1) // The input's number of columns is unknown. resVts = {u}; // TODO How to properly represent such cases (see #421)? else // The input's number of columns is known. resVts = std::vector(numCols, argVts[0][0]); - } - else + } else // Even if the first arg is a frame, its column types get collapsed // to the most general type later on. resVts = argVts[0]; } - // TODO Reduce the code duplication. Merge the traits ValueTypeFromFirstArg and - // ValueTypeFromThirdArg into one parametric trait ValueTypeFromArg, see #487. - else if(op->template hasTrait()) { - if(resDtc == DataTypeCode::FRAME && argDtc[2] == DataTypeCode::MATRIX) { + // TODO Reduce the code duplication. Merge the traits ValueTypeFromFirstArg + // and ValueTypeFromThirdArg into one parametric trait ValueTypeFromArg, + // see #487. + else if (op->template hasTrait()) { + if (resDtc == DataTypeCode::FRAME && argDtc[2] == DataTypeCode::MATRIX) { // We need to make sure that the value type of the input matrix is // repeated in the column value types of the output frame to match // the number of columns of the input matrix. - const ssize_t numCols = op->getOperand(2) - .getType() - .template dyn_cast() - .getNumCols(); - if(numCols == -1) + const ssize_t numCols = op->getOperand(2).getType().template dyn_cast().getNumCols(); + if (numCols == -1) // The input's number of columns is unknown. resVts = {u}; // TODO How to properly represent such cases (see #421)? else // The input's number of columns is known. resVts = std::vector(numCols, argVts[2][0]); - } - else + } else // Even if the third arg is a frame, its column types get collapsed // to the most general type later on. resVts = argVts[2]; - } - else if(op->template hasTrait()) + } else if (op->template hasTrait()) resVts = inferValueTypeFromArgs(argDtc, argVts); - else if(op->template hasTrait()) { + else if (op->template hasTrait()) { // Get the most general value types... resVts = inferValueTypeFromArgs(argDtc, argVts); // ...and replace them by the most general floating-point type where // necessary. - for(size_t i = 0; i < resVts.size(); i++) - if(!llvm::isa(resVts[i]) && !llvm::isa(resVts[i])) + for (size_t i = 0; i < resVts.size(); i++) + if (!llvm::isa(resVts[i]) && !llvm::isa(resVts[i])) resVts[i] = FloatType::getF64(ctx); - } - else if(op->template hasTrait()) { + } else if (op->template hasTrait()) { // Get the most general value types... resVts = inferValueTypeFromArgs(argDtc, argVts); // ...and replace them by the most general integer type where // necessary. 
- for(size_t i = 0; i < resVts.size(); i++) - if(!llvm::isa(resVts[i]) && !llvm::isa(resVts[i])) - resVts[i] = IntegerType::get( - ctx, 64, IntegerType::SignednessSemantics::Unsigned - ); - } - else if(op->template hasTrait()) { + for (size_t i = 0; i < resVts.size(); i++) + if (!llvm::isa(resVts[i]) && !llvm::isa(resVts[i])) + resVts[i] = IntegerType::get(ctx, 64, IntegerType::SignednessSemantics::Unsigned); + } else if (op->template hasTrait()) { const size_t numArgsConsider = 2; - if(argVts.size() < numArgsConsider) - throw std::runtime_error( - "type inference trait ValueTypesConcat requires at least " - "two arguments" - ); + if (argVts.size() < numArgsConsider) + throw std::runtime_error("type inference trait ValueTypesConcat requires at least " + "two arguments"); - switch(resDtc) { - case DataTypeCode::FRAME: - resVts = {}; - for(size_t i = 0; i < numArgsConsider; i++) { - bool abort = false; - switch(argDtc[i]) { - case DataTypeCode::FRAME: - // Append this input frame's column types to the - // result's column types. - for(size_t k = 0; k < argVts[i].size(); k++) - resVts.push_back(argVts[i][k]); - break; - case DataTypeCode::MATRIX: { - const ssize_t numCols = op->getOperand(i) - .getType() - .template dyn_cast() - .getNumCols(); - if(numCols == -1) { - // The number of columns of this input matrix - // is unknown, so it is unclear how often its - // value type needs to be appended to the - // result column types. - resVts = {u}; // TODO How to best represent this case (see #421)? - abort = true; - } - else - // The number of columns of this input matrix - // is known, so we append its value type to the - // result column types that number of times. - for(ssize_t k = 0; k < numCols; k++) - resVts.push_back(argVts[i][0]); - break; - } - case DataTypeCode::SCALAR: - // Append the value type of this input scalar to - // the result column types. + switch (resDtc) { + case DataTypeCode::FRAME: + resVts = {}; + for (size_t i = 0; i < numArgsConsider; i++) { + bool abort = false; + switch (argDtc[i]) { + case DataTypeCode::FRAME: + // Append this input frame's column types to the + // result's column types. + for (size_t k = 0; k < argVts[i].size(); k++) + resVts.push_back(argVts[i][k]); + break; + case DataTypeCode::MATRIX: { + const ssize_t numCols = + op->getOperand(i).getType().template dyn_cast().getNumCols(); + if (numCols == -1) { + // The number of columns of this input matrix + // is unknown, so it is unclear how often its + // value type needs to be appended to the + // result column types. + resVts = {u}; // TODO How to best represent this case + // (see #421)? + abort = true; + } else + // The number of columns of this input matrix + // is known, so we append its value type to the + // result column types that number of times. + for (ssize_t k = 0; k < numCols; k++) resVts.push_back(argVts[i][0]); - break; - case DataTypeCode::UNKNOWN: - // It is unclear how this input contributes to - // the result's column types. - resVts = {u}; // TODO How to best represent this case (see #421)? - abort = true; - break; - } - if(abort) - break; + break; + } + case DataTypeCode::SCALAR: + // Append the value type of this input scalar to + // the result column types. + resVts.push_back(argVts[i][0]); + break; + case DataTypeCode::UNKNOWN: + // It is unclear how this input contributes to + // the result's column types. + resVts = {u}; // TODO How to best represent this case (see #421)? 
+ abort = true; + break; } - break; - case DataTypeCode::MATRIX: // fall-through intended - case DataTypeCode::SCALAR: - resVts = {mostGeneralVt(argVts, numArgsConsider)}; - break; - case DataTypeCode::UNKNOWN: - // nothing to do - break; + if (abort) + break; + } + break; + case DataTypeCode::MATRIX: // fall-through intended + case DataTypeCode::SCALAR: + resVts = {mostGeneralVt(argVts, numArgsConsider)}; + break; + case DataTypeCode::UNKNOWN: + // nothing to do + break; } - } - else if(op->template hasTrait()) + } else if (op->template hasTrait()) resVts = {IntegerType::get(ctx, 64, IntegerType::SignednessSemantics::Signed)}; - else if(op->template hasTrait()) + else if (op->template hasTrait()) resVts = {IndexType::get(ctx)}; - else if(op->template hasTrait()) + else if (op->template hasTrait()) resVts = {daphne::StringType::get(ctx)}; // -------------------------------------------------------------------- @@ -314,20 +286,20 @@ mlir::Type inferTypeByTraits(O * op) { // It is important to recreate matrix and frame types (not reuse those from // the inputs) to get rid of any additional properties (shape, etc.). - switch(resDtc) { - case DataTypeCode::UNKNOWN: - resTy = u; - break; - case DataTypeCode::SCALAR: - resTy = mostGeneralVt(resVts); - break; - case DataTypeCode::MATRIX: - resTy = daphne::MatrixType::get(ctx, mostGeneralVt(resVts)); - break; - case DataTypeCode::FRAME: { - resTy = daphne::FrameType::get(ctx, resVts); - break; - } + switch (resDtc) { + case DataTypeCode::UNKNOWN: + resTy = u; + break; + case DataTypeCode::SCALAR: + resTy = mostGeneralVt(resVts); + break; + case DataTypeCode::MATRIX: + resTy = daphne::MatrixType::get(ctx, mostGeneralVt(resVts)); + break; + case DataTypeCode::FRAME: { + resTy = daphne::FrameType::get(ctx, resVts); + break; + } } // Note that all our type inference traits assume that the operation has diff --git a/src/compiler/lowering/AggAllOpLowering.cpp b/src/compiler/lowering/AggAllOpLowering.cpp index e6398803f..031f814d0 100644 --- a/src/compiler/lowering/AggAllOpLowering.cpp +++ b/src/compiler/lowering/AggAllOpLowering.cpp @@ -58,128 +58,120 @@ using namespace mlir; class SumAllOpLowering : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - - SumAllOpLowering(TypeConverter &typeConverter, MLIRContext *ctx) - : mlir::OpConversionPattern(typeConverter, ctx) { - this->setDebugName("SumAllOpLowering"); - } - // Float and Integer value type matrices have to be handled separately, since - // arith operations are different. 
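// Sketch of the loop nest this pattern emits (illustrative pseudo-IR; the
// float variant is shown, the integer variant accumulates with arith.addi on
// a signless type and casts at the boundaries):
//
//   %res = affine.for %i = 0 to %nR iter_args(%outer = %zero) {
//     %row = affine.for %j = 0 to %nC iter_args(%inner = %zero) {
//       %v = affine.load %memRef[%i, %j]
//       %s = arith.addf %inner, %v
//       affine.yield %s
//     }
//     %o = arith.addf %outer, %row
//     affine.yield %o
//   }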
- LogicalResult - matchAndRewrite(daphne::AllAggSumOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - mlir::daphne::MatrixType matrixType = - adaptor.getArg().getType().dyn_cast(); - auto loc = op->getLoc(); - auto nR = matrixType.getNumRows(); - auto nC = matrixType.getNumCols(); - - auto matrixElementType = matrixType.getElementType(); - auto memRefType = mlir::MemRefType::get({nR, nC}, matrixElementType); - auto memRef = rewriter.create( - op->getLoc(), memRefType, adaptor.getArg()); - - if (matrixElementType.isIntOrIndex()) { - IntegerType signless_type = - rewriter.getIntegerType(matrixElementType.getIntOrFloatBitWidth()); - Value sum = rewriter.create( - loc, signless_type, rewriter.getIntegerAttr(signless_type, 0)); - - SmallVector loopIvs; - SmallVector forOps; - auto outerLoop = - rewriter.create(loc, 0, nR, 1, ValueRange{sum}); - for (Operation &nested : *outerLoop.getBody()) { - rewriter.eraseOp(&nested); - } - loopIvs.push_back(outerLoop.getInductionVar()); - // outer loop body - rewriter.setInsertionPointToStart(outerLoop.getBody()); - Value sum_iter = rewriter.create( - loc, signless_type, rewriter.getIntegerAttr(signless_type, 0)); - // inner loop - auto innerLoop = - rewriter.create(loc, 0, nC, 1, ValueRange{sum_iter}); - for (Operation &nested : *innerLoop.getBody()) { - rewriter.eraseOp(&nested); - } - loopIvs.push_back(innerLoop.getInductionVar()); - // inner loop body - rewriter.setInsertionPointToStart(innerLoop.getBody()); - // load value from memref - Value elementLoad = rewriter.create(loc, memRef, loopIvs); - auto castedElement = this->typeConverter->materializeSourceConversion( - rewriter, loc, signless_type, ValueRange{elementLoad}); - // sum loop iter arg and memref value - mlir::Value inner_sum = rewriter.create( - loc, innerLoop.getRegionIterArgs()[0], castedElement); - // yield inner loop result - rewriter.setInsertionPointToEnd(innerLoop.getBody()); - rewriter.create(loc, inner_sum); - // yield outer loop result - rewriter.setInsertionPointToEnd(outerLoop.getBody()); - mlir::Value outer_sum = rewriter.create( - loc, outerLoop.getRegionIterArgs()[0], innerLoop.getResult(0)); - rewriter.create(loc, outer_sum); - - rewriter.setInsertionPointAfter(outerLoop); - rewriter.create(loc, adaptor.getArg()); - // replace sumAll op with result of loops - auto castedRes = this->typeConverter->materializeTargetConversion( - rewriter, loc, matrixElementType, - ValueRange{outerLoop->getResult(0)}); - rewriter.replaceOp(op, ValueRange{castedRes}); - - return success(); - } else { - Value sum = rewriter.create( - loc, matrixElementType, rewriter.getFloatAttr(matrixElementType, 0)); - - SmallVector loopIvs; - SmallVector forOps; - auto outerLoop = - rewriter.create(loc, 0, nR, 1, ValueRange{sum}); - for (Operation &nested : *outerLoop.getBody()) { - rewriter.eraseOp(&nested); - } - loopIvs.push_back(outerLoop.getInductionVar()); - // outer loop body - rewriter.setInsertionPointToStart(outerLoop.getBody()); - Value sum_iter = rewriter.create( - loc, matrixElementType, rewriter.getFloatAttr(matrixElementType, 0)); - // inner loop - auto innerLoop = - rewriter.create(loc, 0, nC, 1, ValueRange{sum_iter}); - for (Operation &nested : *innerLoop.getBody()) { - rewriter.eraseOp(&nested); - } - loopIvs.push_back(innerLoop.getInductionVar()); - // inner loop body - rewriter.setInsertionPointToStart(innerLoop.getBody()); - // load value from memref - auto elementLoad = rewriter.create(loc, memRef, loopIvs); - // sum loop iter arg and memref value - 
mlir::Value inner_sum = rewriter.create( - loc, innerLoop.getRegionIterArgs()[0], elementLoad); - // yield inner loop result - rewriter.setInsertionPointToEnd(innerLoop.getBody()); - rewriter.create(loc, inner_sum); - // yield outer loop result - rewriter.setInsertionPointToEnd(outerLoop.getBody()); - mlir::Value outer_sum = rewriter.create( - loc, outerLoop.getRegionIterArgs()[0], innerLoop.getResult(0)); - rewriter.create(loc, outer_sum); - - rewriter.setInsertionPointAfter(outerLoop); - rewriter.create(loc, adaptor.getArg()); - // replace sumAll op with result of loops - rewriter.replaceOp(op, outerLoop.getResult(0)); - - return success(); + public: + using OpConversionPattern::OpConversionPattern; + + SumAllOpLowering(TypeConverter &typeConverter, MLIRContext *ctx) + : mlir::OpConversionPattern(typeConverter, ctx) { + this->setDebugName("SumAllOpLowering"); + } + // Float and Integer value type matrices have to be handled separately, + // since arith operations are different. + LogicalResult matchAndRewrite(daphne::AllAggSumOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + mlir::daphne::MatrixType matrixType = adaptor.getArg().getType().dyn_cast(); + auto loc = op->getLoc(); + auto nR = matrixType.getNumRows(); + auto nC = matrixType.getNumCols(); + + auto matrixElementType = matrixType.getElementType(); + auto memRefType = mlir::MemRefType::get({nR, nC}, matrixElementType); + auto memRef = + rewriter.create(op->getLoc(), memRefType, adaptor.getArg()); + + if (matrixElementType.isIntOrIndex()) { + IntegerType signless_type = rewriter.getIntegerType(matrixElementType.getIntOrFloatBitWidth()); + Value sum = + rewriter.create(loc, signless_type, rewriter.getIntegerAttr(signless_type, 0)); + + SmallVector loopIvs; + SmallVector forOps; + auto outerLoop = rewriter.create(loc, 0, nR, 1, ValueRange{sum}); + for (Operation &nested : *outerLoop.getBody()) { + rewriter.eraseOp(&nested); + } + loopIvs.push_back(outerLoop.getInductionVar()); + // outer loop body + rewriter.setInsertionPointToStart(outerLoop.getBody()); + Value sum_iter = + rewriter.create(loc, signless_type, rewriter.getIntegerAttr(signless_type, 0)); + // inner loop + auto innerLoop = rewriter.create(loc, 0, nC, 1, ValueRange{sum_iter}); + for (Operation &nested : *innerLoop.getBody()) { + rewriter.eraseOp(&nested); + } + loopIvs.push_back(innerLoop.getInductionVar()); + // inner loop body + rewriter.setInsertionPointToStart(innerLoop.getBody()); + // load value from memref + Value elementLoad = rewriter.create(loc, memRef, loopIvs); + auto castedElement = + this->typeConverter->materializeSourceConversion(rewriter, loc, signless_type, ValueRange{elementLoad}); + // sum loop iter arg and memref value + mlir::Value inner_sum = + rewriter.create(loc, innerLoop.getRegionIterArgs()[0], castedElement); + // yield inner loop result + rewriter.setInsertionPointToEnd(innerLoop.getBody()); + rewriter.create(loc, inner_sum); + // yield outer loop result + rewriter.setInsertionPointToEnd(outerLoop.getBody()); + mlir::Value outer_sum = + rewriter.create(loc, outerLoop.getRegionIterArgs()[0], innerLoop.getResult(0)); + rewriter.create(loc, outer_sum); + + rewriter.setInsertionPointAfter(outerLoop); + rewriter.create(loc, adaptor.getArg()); + // replace sumAll op with result of loops + auto castedRes = this->typeConverter->materializeTargetConversion(rewriter, loc, matrixElementType, + ValueRange{outerLoop->getResult(0)}); + rewriter.replaceOp(op, ValueRange{castedRes}); + + return success(); + } else { + 
Value sum = rewriter.create(loc, matrixElementType, + rewriter.getFloatAttr(matrixElementType, 0)); + + SmallVector loopIvs; + SmallVector forOps; + auto outerLoop = rewriter.create(loc, 0, nR, 1, ValueRange{sum}); + for (Operation &nested : *outerLoop.getBody()) { + rewriter.eraseOp(&nested); + } + loopIvs.push_back(outerLoop.getInductionVar()); + // outer loop body + rewriter.setInsertionPointToStart(outerLoop.getBody()); + Value sum_iter = rewriter.create(loc, matrixElementType, + rewriter.getFloatAttr(matrixElementType, 0)); + // inner loop + auto innerLoop = rewriter.create(loc, 0, nC, 1, ValueRange{sum_iter}); + for (Operation &nested : *innerLoop.getBody()) { + rewriter.eraseOp(&nested); + } + loopIvs.push_back(innerLoop.getInductionVar()); + // inner loop body + rewriter.setInsertionPointToStart(innerLoop.getBody()); + // load value from memref + auto elementLoad = rewriter.create(loc, memRef, loopIvs); + // sum loop iter arg and memref value + mlir::Value inner_sum = + rewriter.create(loc, innerLoop.getRegionIterArgs()[0], elementLoad); + // yield inner loop result + rewriter.setInsertionPointToEnd(innerLoop.getBody()); + rewriter.create(loc, inner_sum); + // yield outer loop result + rewriter.setInsertionPointToEnd(outerLoop.getBody()); + mlir::Value outer_sum = + rewriter.create(loc, outerLoop.getRegionIterArgs()[0], innerLoop.getResult(0)); + rewriter.create(loc, outer_sum); + + rewriter.setInsertionPointAfter(outerLoop); + rewriter.create(loc, adaptor.getArg()); + // replace sumAll op with result of loops + rewriter.replaceOp(op, outerLoop.getResult(0)); + + return success(); + } } - } }; namespace { @@ -191,61 +183,58 @@ namespace { * This rewrite may enable loop fusion of the produced affine loops by * running the loop fusion pass. */ -struct AggAllLoweringPass - : public mlir::PassWrapper> { - explicit AggAllLoweringPass() {} - - StringRef getArgument() const final { return "lower-agg"; } - StringRef getDescription() const final { - return "Lowers AggAll operators to a set of affine loops and performs " - "the aggregation on a MemRef which is created from the input " - "DenseMatrix."; - } - - void getDependentDialects(mlir::DialectRegistry ®istry) const override { - registry.insert(); - } - void runOnOperation() final; +struct AggAllLoweringPass : public mlir::PassWrapper> { + explicit AggAllLoweringPass() {} + + StringRef getArgument() const final { return "lower-agg"; } + StringRef getDescription() const final { + return "Lowers AggAll operators to a set of affine loops and performs " + "the aggregation on a MemRef which is created from the input " + "DenseMatrix."; + } + + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() final; }; } // end anonymous namespace void AggAllLoweringPass::runOnOperation() { - mlir::ConversionTarget target(getContext()); - mlir::RewritePatternSet patterns(&getContext()); - LowerToLLVMOptions llvmOptions(&getContext()); - LLVMTypeConverter typeConverter(&getContext(), llvmOptions); - - typeConverter.addConversion(convertInteger); - typeConverter.addConversion(convertFloat); - typeConverter.addConversion([](Type type) { return type; }); - typeConverter.addArgumentMaterialization(materializeCastFromIllegal); - typeConverter.addSourceMaterialization(materializeCastToIllegal); - typeConverter.addTargetMaterialization(materializeCastFromIllegal); - - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - 
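// ---------------------------------------------------------------------------
// For illustration (not part of the patch): the loop structure that
// SumAllOpLowering builds above, as a standalone sketch of the float case.
// It assumes MLIR's Affine and Arith dialects (with AffineForOp in the mlir
// namespace, as in the LLVM version this file targets); the helper name
// `buildMatrixSumF64` is hypothetical. Instead of erasing the auto-created
// loop terminators as the pattern does, it uses AffineForOp's body-builder
// callback; the running sum is threaded through the loops' iter_args.
static mlir::Value buildMatrixSumF64(mlir::OpBuilder &b, mlir::Location loc, mlir::Value memRef, int64_t nR,
                                     int64_t nC) {
    using namespace mlir;
    Value zero = b.create<arith::ConstantOp>(loc, b.getF64FloatAttr(0.0));
    auto outerLoop = b.create<AffineForOp>(
        loc, 0, nR, 1, ValueRange{zero}, [&](OpBuilder &ob, Location l, Value i, ValueRange outerIter) {
            auto innerLoop = ob.create<AffineForOp>(
                l, 0, nC, 1, ValueRange{outerIter[0]}, [&](OpBuilder &ib, Location l2, Value j, ValueRange innerIter) {
                    // Load one element and add it to the running sum.
                    Value elem = ib.create<AffineLoadOp>(l2, memRef, ValueRange{i, j});
                    Value acc = ib.create<arith::AddFOp>(l2, innerIter[0], elem);
                    ib.create<AffineYieldOp>(l2, acc);
                });
            // Forward the row sum to the outer loop's iter_arg.
            ob.create<AffineYieldOp>(l, innerLoop.getResult(0));
        });
    // Result of the outer loop = sum over all nR x nC elements.
    return outerLoop.getResult(0);
}
// ---------------------------------------------------------------------------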
target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - - target.addIllegalOp(); - - patterns.insert(typeConverter, &getContext()); - auto module = getOperation(); - if (failed(applyPartialConversion(module, target, std::move(patterns)))) { - signalPassFailure(); - } + mlir::ConversionTarget target(getContext()); + mlir::RewritePatternSet patterns(&getContext()); + LowerToLLVMOptions llvmOptions(&getContext()); + LLVMTypeConverter typeConverter(&getContext(), llvmOptions); + + typeConverter.addConversion(convertInteger); + typeConverter.addConversion(convertFloat); + typeConverter.addConversion([](Type type) { return type; }); + typeConverter.addArgumentMaterialization(materializeCastFromIllegal); + typeConverter.addSourceMaterialization(materializeCastToIllegal); + typeConverter.addTargetMaterialization(materializeCastFromIllegal); + + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + + target.addIllegalOp(); + + patterns.insert(typeConverter, &getContext()); + auto module = getOperation(); + if (failed(applyPartialConversion(module, target, std::move(patterns)))) { + signalPassFailure(); + } } std::unique_ptr mlir::daphne::createAggAllOpLoweringPass() { - return std::make_unique(); + return std::make_unique(); } diff --git a/src/compiler/lowering/CMakeLists.txt b/src/compiler/lowering/CMakeLists.txt index 02bbf26ca..af7f0eb88 100644 --- a/src/compiler/lowering/CMakeLists.txt +++ b/src/compiler/lowering/CMakeLists.txt @@ -26,7 +26,6 @@ add_mlir_dialect_library(MLIRDaphneTransforms RewriteToCallKernelOpPass.cpp SpecializeGenericFunctionsPass.cpp VectorizeComputationsPass.cpp - WhileLoopInvariantCodeMotionPass.cpp DaphneOptPass.cpp EwOpsLowering.cpp ModOpLowering.cpp diff --git a/src/compiler/lowering/DaphneOptPass.cpp b/src/compiler/lowering/DaphneOptPass.cpp index 8795962e2..4b4ea2493 100644 --- a/src/compiler/lowering/DaphneOptPass.cpp +++ b/src/compiler/lowering/DaphneOptPass.cpp @@ -2,7 +2,6 @@ #include "compiler/utils/LoweringUtils.h" #include "ir/daphneir/Daphne.h" #include "ir/daphneir/Passes.h" -#include "llvm/Support/Debug.h" #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" @@ -14,33 +13,31 @@ #include "mlir/Pass/Pass.h" #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/Support/Debug.h" #define DEBUG_TYPE "dm-opt" using namespace mlir; class IntegerModOpt : public mlir::OpConversionPattern { - public: + public: using OpConversionPattern::OpConversionPattern; [[nodiscard]] static bool optimization_viable(mlir::daphne::EwModOp op) { - if (!op.getRhs().getType().isUnsignedInteger()) return false; + if (!op.getRhs().getType().isUnsignedInteger()) + return false; - std::pair isConstant = - CompilerUtils::isConstant(op.getRhs()); - // Apply (lhs % rhs) to (lhs & (rhs - 1)) optimization when rhs is a power of two + std::pair isConstant = CompilerUtils::isConstant(op.getRhs()); + // Apply (lhs % rhs) to (lhs & (rhs - 1)) optimization when rhs is a + // power of two return isConstant.first && (isConstant.second & (isConstant.second - 1)) == 0; } - mlir::LogicalResult 
matchAndRewrite( - mlir::daphne::EwModOp op, OpAdaptor adaptor, - mlir::ConversionPatternRewriter &rewriter) const override { - mlir::Value cst_one = rewriter.create( - op.getLoc(), static_cast(1)); - mlir::Value sub = rewriter.create( - op.getLoc(), adaptor.getRhs(), cst_one); - mlir::Value andOp = rewriter.create( - op.getLoc(), adaptor.getLhs(), sub); + mlir::LogicalResult matchAndRewrite(mlir::daphne::EwModOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + mlir::Value cst_one = rewriter.create(op.getLoc(), static_cast(1)); + mlir::Value sub = rewriter.create(op.getLoc(), adaptor.getRhs(), cst_one); + mlir::Value andOp = rewriter.create(op.getLoc(), adaptor.getLhs(), sub); rewriter.replaceOp(op, andOp); return success(); } @@ -52,14 +49,11 @@ namespace { * the DaphneDialect to a different set of operations also from the * DaphneDialect. */ -struct DenseMatrixOptPass - : public mlir::PassWrapper> { +struct DenseMatrixOptPass : public mlir::PassWrapper> { explicit DenseMatrixOptPass() {} void getDependentDialects(mlir::DialectRegistry ®istry) const override { - registry.insert(); + registry.insert(); } void runOnOperation() final; @@ -70,7 +64,7 @@ struct DenseMatrixOptPass "also from the DaphneDialect."; } }; -} // end anonymous namespace +} // end anonymous namespace void DenseMatrixOptPass::runOnOperation() { mlir::ConversionTarget target(getContext()); @@ -85,9 +79,7 @@ void DenseMatrixOptPass::runOnOperation() { target.addLegalDialect(); target.addDynamicallyLegalOp( - [&](mlir::daphne::EwModOp op) { - return !IntegerModOpt::optimization_viable(op); - }); + [&](mlir::daphne::EwModOp op) { return !IntegerModOpt::optimization_viable(op); }); patterns.insert(typeConverter, &getContext()); @@ -97,6 +89,4 @@ void DenseMatrixOptPass::runOnOperation() { } } -std::unique_ptr mlir::daphne::createDaphneOptPass() { - return std::make_unique(); -} +std::unique_ptr mlir::daphne::createDaphneOptPass() { return std::make_unique(); } diff --git a/src/compiler/lowering/DistributeComputationsPass.cpp b/src/compiler/lowering/DistributeComputationsPass.cpp index 088688d31..8f9428587 100644 --- a/src/compiler/lowering/DistributeComputationsPass.cpp +++ b/src/compiler/lowering/DistributeComputationsPass.cpp @@ -18,25 +18,21 @@ #include "ir/daphneir/Passes.h" #include "mlir/Dialect/Arith/IR/Arith.h" -#include #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Transforms/DialectConversion.h" +#include #include #include using namespace mlir; -namespace -{ -struct Distribute : public OpInterfaceConversionPattern -{ +namespace { +struct Distribute : public OpInterfaceConversionPattern { using OpInterfaceConversionPattern::OpInterfaceConversionPattern; - LogicalResult - matchAndRewrite(daphne::Distributable op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override - { + LogicalResult matchAndRewrite(daphne::Distributable op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { std::vector distributedInputs; for (auto zipIt : llvm::zip(operands, op.getOperandDistrPrimitives())) { Value operand = std::get<0>(zipIt); @@ -56,7 +52,7 @@ struct Distribute : public OpInterfaceConversionPattern else { // The operands need to be distributed/broadcasted first. 
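// ---------------------------------------------------------------------------
// For illustration (not part of the patch): the arithmetic identity behind
// IntegerModOpt above, as a plain C++ sketch with hypothetical helper names.
// For unsigned lhs and rhs == 2^k, the remainder lhs % rhs is exactly the low
// k bits of lhs, i.e. lhs & (rhs - 1). Note that the test (n & (n - 1)) == 0
// on its own is also true for n == 0, so a zero guard is included here; in
// the pattern above, rhs is additionally required to be a compile-time
// constant.
#include <cassert>
#include <cstdint>

static bool isPowerOfTwo(uint64_t n) { return n != 0 && (n & (n - 1)) == 0; }

static uint64_t modPowerOfTwo(uint64_t lhs, uint64_t rhs) {
    assert(isPowerOfTwo(rhs));
    return lhs & (rhs - 1); // same value as lhs % rhs, but a single AND
}
// ---------------------------------------------------------------------------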
Type t = daphne::HandleType::get(getContext(), operand.getType()); - if(isBroadcast) + if (isBroadcast) distributedInputs.push_back(rewriter.create(op->getLoc(), t, operand)); else distributedInputs.push_back(rewriter.create(op->getLoc(), t, operand)); @@ -69,24 +65,19 @@ struct Distribute : public OpInterfaceConversionPattern } }; -struct DistributeComputationsPass - : public PassWrapper> -{ +struct DistributeComputationsPass : public PassWrapper> { void runOnOperation() final; StringRef getArgument() const final { return "distribute-computation"; } StringRef getDescription() const final { return "TODO"; } }; -} +} // namespace -bool onlyMatrixOperands(Operation * op) { - return llvm::all_of(op->getOperandTypes(), [](Type t) { - return llvm::isa(t); - }); +bool onlyMatrixOperands(Operation *op) { + return llvm::all_of(op->getOperandTypes(), [](Type t) { return llvm::isa(t); }); } -void DistributeComputationsPass::runOnOperation() -{ +void DistributeComputationsPass::runOnOperation() { auto module = getOperation(); RewritePatternSet patterns(&getContext()); @@ -95,17 +86,16 @@ void DistributeComputationsPass::runOnOperation() ConversionTarget target(getContext()); target.addLegalDialect(); target.addLegalOp(); - target.addDynamicallyLegalDialect([](Operation *op) - { + target.addDynamicallyLegalDialect([](Operation *op) { // An operation is legal (does not need to be replaced), if ... return - // ... it is not distributable - !llvm::isa(op) || - // ... it is inside some distributed computation already - op->getParentOfType() || - // ... not all of its operands are matrices - // TODO Support distributing frames and scalars. - !onlyMatrixOperands(op); + // ... it is not distributable + !llvm::isa(op) || + // ... it is inside some distributed computation already + op->getParentOfType() || + // ... not all of its operands are matrices + // TODO Support distributing frames and scalars. + !onlyMatrixOperands(op); }); patterns.add(&getContext()); @@ -114,7 +104,6 @@ void DistributeComputationsPass::runOnOperation() signalPassFailure(); } -std::unique_ptr daphne::createDistributeComputationsPass() -{ +std::unique_ptr daphne::createDistributeComputationsPass() { return std::make_unique(); } diff --git a/src/compiler/lowering/DistributePipelinesPass.cpp b/src/compiler/lowering/DistributePipelinesPass.cpp index 03489f63e..ede7ed178 100644 --- a/src/compiler/lowering/DistributePipelinesPass.cpp +++ b/src/compiler/lowering/DistributePipelinesPass.cpp @@ -18,31 +18,28 @@ #include "ir/daphneir/Passes.h" #include "mlir/Dialect/Arith/IR/Arith.h" -#include #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/IRMapping.h" #include "mlir/Transforms/DialectConversion.h" +#include using namespace mlir; /** * @brief Replaces vectorized pipelines by distributed pipelines. 
*/ -struct DistributePipelines : public OpConversionPattern -{ +struct DistributePipelines : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(daphne::VectorizedPipelineOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override - { + LogicalResult matchAndRewrite(daphne::VectorizedPipelineOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { MLIRContext newContext; OpBuilder tempBuilder(&newContext); std::string funcName = "dist"; auto &bodyBlock = op.getBody().front(); - auto funcType = tempBuilder.getFunctionType( - bodyBlock.getArgumentTypes(), bodyBlock.getTerminator()->getOperandTypes()); + auto funcType = + tempBuilder.getFunctionType(bodyBlock.getArgumentTypes(), bodyBlock.getTerminator()->getOperandTypes()); auto funcOp = tempBuilder.create(op.getLoc(), funcName, funcType); IRMapping mapper; @@ -71,36 +68,31 @@ struct DistributePipelines : public OpConversionPattern(op.getLoc(), stream.str()); - rewriter.replaceOpWithNewOp( - op.getOperation(), - op.getOutputs().getTypes(), irStr, newInputs, - op.getOutRows(), op.getOutCols(), rewriter.getArrayAttr(newSplits), op.getCombines() - ); - + rewriter.replaceOpWithNewOp(op.getOperation(), op.getOutputs().getTypes(), irStr, + newInputs, op.getOutRows(), op.getOutCols(), + rewriter.getArrayAttr(newSplits), op.getCombines()); + return success(); } }; -struct DistributePipelinesPass - : public PassWrapper> -{ +struct DistributePipelinesPass : public PassWrapper> { void runOnOperation() final; StringRef getArgument() const final { return "distribute-pipelines"; } StringRef getDescription() const final { return "TODO"; } }; -void DistributePipelinesPass::runOnOperation() -{ +void DistributePipelinesPass::runOnOperation() { auto module = getOperation(); RewritePatternSet patterns(&getContext()); @@ -110,11 +102,11 @@ void DistributePipelinesPass::runOnOperation() // TODO do we need all these? target.addLegalDialect(); target.addLegalOp(); - target.addDynamicallyLegalOp([](daphne::VectorizedPipelineOp op) - { - // TODO Carefully decide if this pipeline shall be distributed, e.g., - // based on physical input size. For now, all pipelines are distributed - // (false means this pipeline is illegal and must be rewritten). + target.addDynamicallyLegalOp([](daphne::VectorizedPipelineOp op) { + // TODO Carefully decide if this pipeline shall be distributed, + // e.g., based on physical input size. For now, all pipelines are + // distributed (false means this pipeline is illegal and must be + // rewritten). 
return false; }); @@ -124,7 +116,4 @@ void DistributePipelinesPass::runOnOperation() signalPassFailure(); } -std::unique_ptr daphne::createDistributePipelinesPass() -{ - return std::make_unique(); -} +std::unique_ptr daphne::createDistributePipelinesPass() { return std::make_unique(); } diff --git a/src/compiler/lowering/EwOpsLowering.cpp b/src/compiler/lowering/EwOpsLowering.cpp index 056e46f44..d04960c83 100644 --- a/src/compiler/lowering/EwOpsLowering.cpp +++ b/src/compiler/lowering/EwOpsLowering.cpp @@ -44,27 +44,23 @@ using namespace mlir; -template -struct UnaryOpLowering : public mlir::OpConversionPattern { +template struct UnaryOpLowering : public mlir::OpConversionPattern { using OpAdaptor = typename mlir::OpConversionPattern::OpAdaptor; - public: + public: UnaryOpLowering(mlir::TypeConverter &typeConverter, mlir::MLIRContext *ctx) : mlir::OpConversionPattern(typeConverter, ctx) { this->setDebugName("EwDaphneOpsLowering"); } - mlir::LogicalResult matchAndRewrite( - UnaryOp op, OpAdaptor adaptor, - mlir::ConversionPatternRewriter &rewriter) const override { + mlir::LogicalResult matchAndRewrite(UnaryOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { mlir::Type type = op.getType(); if (llvm::isa(type)) { - rewriter.replaceOpWithNewOp(op.getOperation(), - adaptor.getOperands()); + rewriter.replaceOpWithNewOp(op.getOperation(), adaptor.getOperands()); } else if (llvm::isa(type)) { - rewriter.replaceOpWithNewOp(op.getOperation(), - adaptor.getOperands()); + rewriter.replaceOpWithNewOp(op.getOperation(), adaptor.getOperands()); } else { return mlir::failure(); } @@ -76,50 +72,42 @@ template class BinaryOpLowering final : public mlir::OpConversionPattern { using OpAdaptor = typename mlir::OpConversionPattern::OpAdaptor; - public: + public: BinaryOpLowering(mlir::TypeConverter &typeConverter, mlir::MLIRContext *ctx) : mlir::OpConversionPattern(typeConverter, ctx) { this->setDebugName("EwDaphneOpLowering"); } - mlir::LogicalResult convertEwScalar( - BinaryOp op, OpAdaptor adaptor, - mlir::ConversionPatternRewriter &rewriter) const { + mlir::LogicalResult convertEwScalar(BinaryOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { auto lhs = adaptor.getLhs(); auto rhs = adaptor.getRhs(); auto loc = op.getLoc(); - if (lhs.getType().template isa() && - rhs.getType().template isa()) { - rewriter.replaceOpWithNewOp(op.getOperation(), - adaptor.getOperands()); + if (lhs.getType().template isa() && rhs.getType().template isa()) { + rewriter.replaceOpWithNewOp(op.getOperation(), adaptor.getOperands()); return mlir::success(); } Value castedLhs = this->typeConverter->materializeTargetConversion( - rewriter, loc, - rewriter.getIntegerType( - adaptor.getRhs().getType().getIntOrFloatBitWidth()), + rewriter, loc, rewriter.getIntegerType(adaptor.getRhs().getType().getIntOrFloatBitWidth()), ValueRange{adaptor.getLhs()}); Value castedRhs = this->typeConverter->materializeTargetConversion( - rewriter, loc, - rewriter.getIntegerType( - adaptor.getRhs().getType().getIntOrFloatBitWidth()), + rewriter, loc, rewriter.getIntegerType(adaptor.getRhs().getType().getIntOrFloatBitWidth()), ValueRange{adaptor.getRhs()}); Value binaryOp = rewriter.create(loc, castedLhs, castedRhs); - Value res = this->typeConverter->materializeSourceConversion( - rewriter, loc, lhs.getType(), ValueRange{binaryOp}); + Value res = + this->typeConverter->materializeSourceConversion(rewriter, loc, lhs.getType(), ValueRange{binaryOp}); rewriter.replaceOp(op, res); return 
mlir::success(); } - mlir::LogicalResult matchAndRewrite( - BinaryOp op, OpAdaptor adaptor, - mlir::ConversionPatternRewriter &rewriter) const override { + mlir::LogicalResult matchAndRewrite(BinaryOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { auto lhs = adaptor.getLhs(); auto rhs = adaptor.getRhs(); @@ -130,98 +118,71 @@ class BinaryOpLowering final : public mlir::OpConversionPattern { // for now assume matrix is LHS and RHS is non matrix mlir::daphne::MatrixType lhsMatrixType = - adaptor.getLhs() - .getType() - .template dyn_cast(); + adaptor.getLhs().getType().template dyn_cast(); auto matrixElementType = lhsMatrixType.getElementType(); auto lhsRows = lhsMatrixType.getNumRows(); auto lhsCols = lhsMatrixType.getNumCols(); - auto lhsMemRefType = - mlir::MemRefType::get({lhsRows, lhsCols}, matrixElementType); + auto lhsMemRefType = mlir::MemRefType::get({lhsRows, lhsCols}, matrixElementType); mlir::Type elementType{}; mlir::Value memRefLhs = - rewriter.create( - op->getLoc(), lhsMemRefType, adaptor.getLhs()); + rewriter.create(op->getLoc(), lhsMemRefType, adaptor.getLhs()); mlir::Value memRefRhs{}; - bool isMatrixMatrix = - rhs.getType().template isa(); + bool isMatrixMatrix = rhs.getType().template isa(); if (isMatrixMatrix) { - memRefRhs = - rewriter.create( - op->getLoc(), lhsMemRefType, adaptor.getRhs()); + memRefRhs = rewriter.create(op->getLoc(), lhsMemRefType, + adaptor.getRhs()); elementType = lhsMemRefType.getElementType(); } else { elementType = rhs.getType(); } - mlir::Value outputMemRef = - insertMemRefAlloc(lhsMemRefType, op->getLoc(), rewriter); + mlir::Value outputMemRef = insertMemRefAlloc(lhsMemRefType, op->getLoc(), rewriter); SmallVector lowerBounds(/*Rank=*/2, /*Value=*/0); SmallVector steps(/*Rank=*/2, /*Value=*/1); buildAffineLoopNest( - rewriter, op.getLoc(), lowerBounds, - {lhsMatrixType.getNumRows(), lhsMatrixType.getNumCols()}, steps, + rewriter, op.getLoc(), lowerBounds, {lhsMatrixType.getNumRows(), lhsMatrixType.getNumCols()}, steps, [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { - mlir::Value loadLhs = - nestedBuilder.create(loc, memRefLhs, ivs); + mlir::Value loadLhs = nestedBuilder.create(loc, memRefLhs, ivs); mlir::Value binaryOp{}; - if (adaptor.getRhs() - .getType() - .template isa()) { - binaryOp = nestedBuilder.create(loc, loadLhs, - adaptor.getRhs()); + if (adaptor.getRhs().getType().template isa()) { + binaryOp = nestedBuilder.create(loc, loadLhs, adaptor.getRhs()); - nestedBuilder.create(loc, binaryOp, - outputMemRef, ivs); + nestedBuilder.create(loc, binaryOp, outputMemRef, ivs); return; } mlir::Value rhs{}; if (isMatrixMatrix) - rhs = - nestedBuilder.create(loc, memRefRhs, ivs); + rhs = nestedBuilder.create(loc, memRefRhs, ivs); else rhs = adaptor.getRhs(); // is integer - if (elementType.isInteger( - elementType.getIntOrFloatBitWidth())) { - Value castedLhs = - this->typeConverter->materializeTargetConversion( - nestedBuilder, loc, - nestedBuilder.getIntegerType( - lhsMemRefType.getElementTypeBitWidth()), - ValueRange{loadLhs}); - - Value castedRhs = - this->typeConverter->materializeTargetConversion( - nestedBuilder, loc, - nestedBuilder.getIntegerType( - lhsMemRefType.getElementTypeBitWidth()), - ValueRange{rhs}); - - binaryOp = - nestedBuilder.create(loc, castedLhs, castedRhs); - Value castedRes = - this->typeConverter->materializeSourceConversion( - nestedBuilder, loc, elementType, - ValueRange{binaryOp}); - nestedBuilder.create(loc, castedRes, - outputMemRef, ivs); + if 
(elementType.isInteger(elementType.getIntOrFloatBitWidth())) { + Value castedLhs = this->typeConverter->materializeTargetConversion( + nestedBuilder, loc, nestedBuilder.getIntegerType(lhsMemRefType.getElementTypeBitWidth()), + ValueRange{loadLhs}); + + Value castedRhs = this->typeConverter->materializeTargetConversion( + nestedBuilder, loc, nestedBuilder.getIntegerType(lhsMemRefType.getElementTypeBitWidth()), + ValueRange{rhs}); + + binaryOp = nestedBuilder.create(loc, castedLhs, castedRhs); + Value castedRes = this->typeConverter->materializeSourceConversion(nestedBuilder, loc, elementType, + ValueRange{binaryOp}); + nestedBuilder.create(loc, castedRes, outputMemRef, ivs); } else { // is float binaryOp = nestedBuilder.create(loc, loadLhs, rhs); - nestedBuilder.create(loc, binaryOp, - outputMemRef, ivs); + nestedBuilder.create(loc, binaryOp, outputMemRef, ivs); } }); - mlir::Value output = convertMemRefToDenseMatrix( - op->getLoc(), rewriter, outputMemRef, op.getType()); + mlir::Value output = convertMemRefToDenseMatrix(op->getLoc(), rewriter, outputMemRef, op.getType()); rewriter.replaceOp(op, output); return mlir::success(); @@ -247,14 +208,11 @@ namespace { * This rewrite may enable loop fusion of the produced affine loops by * running the loop fusion pass. */ -struct EwOpLoweringPass - : public mlir::PassWrapper> { +struct EwOpLoweringPass : public mlir::PassWrapper> { explicit EwOpLoweringPass() {} void getDependentDialects(mlir::DialectRegistry ®istry) const override { - registry.insert(); } void runOnOperation() final; @@ -265,10 +223,9 @@ struct EwOpLoweringPass "structures and arithmetic operations."; } }; -} // end anonymous namespace +} // end anonymous namespace -void populateLowerEwOpConversionPatterns(mlir::LLVMTypeConverter &typeConverter, - mlir::RewritePatternSet &patterns) { +void populateLowerEwOpConversionPatterns(mlir::LLVMTypeConverter &typeConverter, mlir::RewritePatternSet &patterns) { // clang-format off patterns.insert< AddOpLowering, @@ -294,29 +251,20 @@ void EwOpLoweringPass::runOnOperation() { typeConverter.addSourceMaterialization(materializeCastToIllegal); typeConverter.addTargetMaterialization(materializeCastFromIllegal); - target.addLegalDialect(); + target.addLegalDialect(); target.addDynamicallyLegalOp( - [](Operation *op) { - return llvm::isa(op->getOperandTypes()[0]); - }); + [](Operation *op) { return llvm::isa(op->getOperandTypes()[0]); }); - target.addDynamicallyLegalOp([](Operation *op) { + target.addDynamicallyLegalOp([](Operation *op) { if (llvm::isa(op->getOperandTypes()[0]) && llvm::isa(op->getOperandTypes()[1])) { - mlir::daphne::MatrixType lhs = - op->getOperandTypes()[0] - .template dyn_cast(); - mlir::daphne::MatrixType rhs = - op->getOperandTypes()[1] - .template dyn_cast(); - if (lhs.getNumRows() != rhs.getNumRows() || - lhs.getNumCols() != rhs.getNumCols() || + mlir::daphne::MatrixType lhs = op->getOperandTypes()[0].template dyn_cast(); + mlir::daphne::MatrixType rhs = op->getOperandTypes()[1].template dyn_cast(); + if (lhs.getNumRows() != rhs.getNumRows() || lhs.getNumCols() != rhs.getNumCols() || lhs.getNumRows() == -1 || lhs.getNumCols() == -1) return true; @@ -324,8 +272,7 @@ void EwOpLoweringPass::runOnOperation() { } if (llvm::isa(op->getOperandTypes()[0])) { - mlir::daphne::MatrixType lhsMatrixType = - op->getOperandTypes()[0].dyn_cast(); + mlir::daphne::MatrixType lhsMatrixType = op->getOperandTypes()[0].dyn_cast(); return lhsMatrixType.getNumRows() == -1 || lhsMatrixType.getNumCols() == -1; } @@ -339,6 +286,4 @@ void 
EwOpLoweringPass::runOnOperation() { signalPassFailure(); } -std::unique_ptr mlir::daphne::createEwOpLoweringPass() { - return std::make_unique(); -} +std::unique_ptr mlir::daphne::createEwOpLoweringPass() { return std::make_unique(); } diff --git a/src/compiler/lowering/InsertDaphneContextPass.cpp b/src/compiler/lowering/InsertDaphneContextPass.cpp index c801cda8f..feb3a8bf0 100644 --- a/src/compiler/lowering/InsertDaphneContextPass.cpp +++ b/src/compiler/lowering/InsertDaphneContextPass.cpp @@ -33,54 +33,52 @@ using namespace mlir; // extensions in several directions, e.g.: // - inserting the context into blocks (e.g. parfor loop bodies) // - passing the context as an argument to a function -struct InsertDaphneContextPass : public PassWrapper> -{ - const DaphneUserConfig& user_config; - explicit InsertDaphneContextPass(const DaphneUserConfig& cfg) : user_config(cfg) {} +struct InsertDaphneContextPass : public PassWrapper> { + const DaphneUserConfig &user_config; + explicit InsertDaphneContextPass(const DaphneUserConfig &cfg) : user_config(cfg) {} void runOnOperation() final; }; -void InsertDaphneContextPass::runOnOperation() -{ +void InsertDaphneContextPass::runOnOperation() { func::FuncOp f = getOperation(); - Block & b = f.getBody().front(); - + Block &b = f.getBody().front(); + OpBuilder builder(&b, b.begin()); Location loc = f.getLoc(); // Insert a CreateDaphneContextOp as the first operation in the block. - builder.create(loc, daphne::DaphneContextType::get(&getContext()), - builder.create(loc, reinterpret_cast(&user_config)), - builder.create(loc, reinterpret_cast(&KernelDispatchMapping::instance())), - builder.create(loc, reinterpret_cast(&Statistics::instance())), - builder.create(loc, reinterpret_cast(&StringRefCounter::instance()))); + builder.create( + loc, daphne::DaphneContextType::get(&getContext()), + builder.create(loc, reinterpret_cast(&user_config)), + builder.create(loc, reinterpret_cast(&KernelDispatchMapping::instance())), + builder.create(loc, reinterpret_cast(&Statistics::instance())), + builder.create(loc, reinterpret_cast(&StringRefCounter::instance()))); #ifdef USE_CUDA - if(user_config.use_cuda) { + if (user_config.use_cuda) { builder.create(loc); } #endif - if (user_config.use_distributed){ + if (user_config.use_distributed) { builder.create(loc); } #ifdef USE_HDFS - if(user_config.use_hdfs) { + if (user_config.use_hdfs) { builder.create(loc); } #endif #ifdef USE_FPGAOPENCL - if(user_config.use_fpgaopencl) { + if (user_config.use_fpgaopencl) { builder.create(loc); } #endif - + // Insert a DestroyDaphneContextOp as the last operation in the block, but // before the block's terminator. builder.setInsertionPoint(b.getTerminator()); builder.create(loc); } -std::unique_ptr daphne::createInsertDaphneContextPass(const DaphneUserConfig& cfg) -{ +std::unique_ptr daphne::createInsertDaphneContextPass(const DaphneUserConfig &cfg) { return std::make_unique(cfg); } diff --git a/src/compiler/lowering/LowerToLLVMPass.cpp b/src/compiler/lowering/LowerToLLVMPass.cpp index 31d981ab3..0f14ab0cf 100644 --- a/src/compiler/lowering/LowerToLLVMPass.cpp +++ b/src/compiler/lowering/LowerToLLVMPass.cpp @@ -14,9 +14,9 @@ * limitations under the License. 
 */

+#include "compiler/utils/CompilerUtils.h"
 #include "ir/daphneir/Daphne.h"
 #include "ir/daphneir/Passes.h"
-#include "compiler/utils/CompilerUtils.h"

 #include

 #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h"
@@ -24,13 +24,13 @@
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
 #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
-#include "mlir/Conversion/LinalgToStandard/LinalgToStandard.h"
-#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Conversion/LinalgToStandard/LinalgToStandard.h"
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
+#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Func/Transforms/FuncConversions.h"
@@ -38,38 +38,36 @@
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Transforms/DialectConversion.h"

+#include
 #include
 #include
 #include
-#include

 using namespace mlir;

 // Remark on the creation of mlir::LLVM::AllocaOp
 // ==============================================
-// This pass creates an mlir::LLVM::AllocaOp in several places and for various purposes,
-// e.g., to store the result pointer of a kernel call, for variadic operands/results, etc.
-// AllocaOp should not be inside a loop, as its repeated execution at run-time can lead
-// to a stack overflow (depending on the number of iterations, the number of AllocaOps
-// inside the loop, and the stack size). The reason is that the memory allocated by AllocaOp
-// is freed only at the end of the scope (i.e., function).
-// To avoid such problems, we don't create AllocaOps at the original insertion point of
-// the rewriter, but at the beginning of function surrounding the currently considered op.
-// To this end, we use the rewriter's ability to switch between different insertion points.
-// Note that the memory allocated by an AllocaOp can be reused by multiple repeated
-// kernel calls.
+// This pass creates an mlir::LLVM::AllocaOp in several places and for various
+// purposes, e.g., to store the result pointer of a kernel call, for variadic
+// operands/results, etc. AllocaOp should not be inside a loop, as its repeated
+// execution at run-time can lead to a stack overflow (depending on the number
+// of iterations, the number of AllocaOps inside the loop, and the stack size).
+// The reason is that the memory allocated by AllocaOp is freed only at the end
+// of the scope (i.e., function). To avoid such problems, we don't create
+// AllocaOps at the original insertion point of the rewriter, but at the
+// beginning of the function surrounding the currently considered op. To this
+// end, we use the rewriter's ability to switch between different insertion
+// points. Note that the memory allocated by an AllocaOp can be reused by
+// multiple repeated kernel calls. (A standalone sketch of this hoisting idiom
+// follows below.)

 // Optional attribute of CallKernelOp, which indicates that all results shall
 // be combined into a single variadic result.
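// ---------------------------------------------------------------------------
// For illustration (not part of the patch): the AllocaOp-hoisting idiom
// described in the remark above, as a standalone sketch. The helper name
// `createHoistedAlloca` is hypothetical, and the surrounding function is
// assumed to be an LLVM::LLVMFuncOp; the point is the save/set/restore dance
// around the rewriter's insertion point.
static mlir::Value createHoistedAlloca(mlir::PatternRewriter &rewriter, mlir::Operation *op, mlir::Type ptrType,
                                       mlir::Value numElements) {
    // Remember where lowering currently is.
    mlir::OpBuilder::InsertPoint ipHere = rewriter.saveInsertionPoint();
    // Emit the alloca once, at the entry block of the surrounding function,
    // so it is not re-executed on every loop iteration.
    mlir::Block &entry = op->getParentOfType<mlir::LLVM::LLVMFuncOp>().getBody().front();
    rewriter.setInsertionPointToStart(&entry);
    mlir::Value buf = rewriter.create<mlir::LLVM::AllocaOp>(op->getLoc(), ptrType, numElements, /*alignment=*/1);
    // Return to the original insertion point.
    rewriter.restoreInsertionPoint(ipHere);
    return buf;
}
// ---------------------------------------------------------------------------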
const std::string ATTR_HASVARIADICRESULTS = "hasVariadicResults"; -struct ReturnOpLowering : public OpRewritePattern -{ +struct ReturnOpLowering : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(daphne::ReturnOp op, - PatternRewriter &rewriter) const final - { + LogicalResult matchAndRewrite(daphne::ReturnOp op, PatternRewriter &rewriter) const final { rewriter.replaceOpWithNewOp(op, op.getOperands()); return success(); } @@ -78,9 +76,8 @@ struct ReturnOpLowering : public OpRewritePattern struct CastOpLowering : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(daphne::CastOp op, - PatternRewriter &rewriter) const final { - if(op.isTrivialCast() || op.isRemovePropertyCast()) { + LogicalResult matchAndRewrite(daphne::CastOp op, PatternRewriter &rewriter) const final { + if (op.isTrivialCast() || op.isRemovePropertyCast()) { rewriter.replaceOp(op, op.getOperand()); return success(); } @@ -90,17 +87,14 @@ struct CastOpLowering : public OpRewritePattern { /// ConstantOp lowering for types not handled before (str) -class ConstantOpLowering : public OpConversionPattern -{ -public: +class ConstantOpLowering : public OpConversionPattern { + public: using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(daphne::ConstantOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override - { + LogicalResult matchAndRewrite(daphne::ConstantOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { Location loc = op->getLoc(); - if(auto strAttr = op.getValue().dyn_cast()) { + if (auto strAttr = op.getValue().dyn_cast()) { StringRef sr = strAttr.getValue(); #if 1 // MLIR does not have direct support for strings. Thus, if this is @@ -109,76 +103,60 @@ class ConstantOpLowering : public OpConversionPattern // characters of the string constant to that array one by one. The // SSA value of the constant is replaced by a pointer to i8 // pointing to the allocated buffer. - Type i8PtrType = LLVM::LLVMPointerType::get( - IntegerType::get(rewriter.getContext(), 8) - ); + Type i8PtrType = LLVM::LLVMPointerType::get(IntegerType::get(rewriter.getContext(), 8)); const size_t numChars = sr.size() + 1; // +1 for trailing '\0' const std::string str = sr.str(); - const char * chars = str.c_str(); + const char *chars = str.c_str(); - // We could assume that the daphne::ConstantOp `op` is *not* inside a loop, - // because constants are typically moved to the top of a function during - // canonicalization. Consequently, we would not need to change the insertion - // point. However, being defensive, we still do it. + // We could assume that the daphne::ConstantOp `op` is *not* inside + // a loop, because constants are typically moved to the top of a + // function during canonicalization. Consequently, we would not need + // to change the insertion point. However, being defensive, we still + // do it. - // Set the insertion point to the beginning of the function surrounding this ConstantOp - // (see comment on AllocaOp above). + // Set the insertion point to the beginning of the function + // surrounding this ConstantOp (see comment on AllocaOp above). 
             OpBuilder::InsertPoint ipHere = rewriter.saveInsertionPoint();
-            Block & fb = op.getOperation()->getParentOfType().getBody().front();
+            Block &fb = op.getOperation()->getParentOfType().getBody().front();
             rewriter.setInsertionPointToStart(&fb);
             auto allocaOp = rewriter.replaceOpWithNewOp(
-                op.getOperation(),
-                i8PtrType,
-                rewriter.create(loc, rewriter.getI64IntegerAttr(numChars)),
-                1
-            );
+                op.getOperation(), i8PtrType,
+                rewriter.create(loc, rewriter.getI64IntegerAttr(numChars)), 1);

             // Go back to the original insertion point.
             rewriter.restoreInsertionPoint(ipHere);

-            for(size_t i = 0; i < numChars; i++) {
-                std::vector indices = {
-                    rewriter.create(loc, rewriter.getI64IntegerAttr(i))
-                };
+            for (size_t i = 0; i < numChars; i++) {
+                std::vector indices = {rewriter.create(loc, rewriter.getI64IntegerAttr(i))};
                 rewriter.create(
-                    loc,
-                    rewriter.create(
-                        loc, rewriter.getI8IntegerAttr(chars[i])
-                    ),
-                    rewriter.create(
-                        op->getLoc(), i8PtrType, allocaOp, indices
-                    )
-                );
+                    loc, rewriter.create(loc, rewriter.getI8IntegerAttr(chars[i])),
+                    rewriter.create(op->getLoc(), i8PtrType, allocaOp, indices));
             }
 #else
             // Alternatively, we could create a global string, which would
             // yield a pointer to i8, too. However, we would need to choose a
             // unique name.
-            rewriter.replaceOp(
-                op.getOperation(),
-                LLVM::createGlobalString(
-                    loc, rewriter, "someName", sr,
-                    LLVM::Linkage::Private // TODO Does that make sense?
-                )
-            );
+            rewriter.replaceOp(op.getOperation(),
+                               LLVM::createGlobalString(loc, rewriter, "someName", sr,
+                                                        LLVM::Linkage::Private // TODO Does that make sense?
+                                                        ));
 #endif
-        }
-        else {
-            // Constants of all other types are lowered to an mlir::arith::ConstantOp.
-            // Note that this is a different op than mlir::daphne::ConstantOp!
+        } else {
+            // Constants of all other types are lowered to an
+            // mlir::arith::ConstantOp. Note that this is a different op than
+            // mlir::daphne::ConstantOp!
 #if 1
             rewriter.replaceOpWithNewOp(op.getOperation(), op.getValue());
 #else
-            // NOTE: this fixes printing due to an error in the LLVMDialect, but is the wrong behaviour.
+            // NOTE: this fixes printing due to an error in the LLVMDialect, but
+            // is the wrong behaviour.
// Use this for debugging only if (auto iTy = op.getType().dyn_cast()) { auto ty = IntegerType::get(getContext(), iTy.getWidth()); - rewriter.replaceOpWithNewOp(op.getOperation(), - ty, - IntegerAttr::get(ty, op.getValue().cast().getValue())); - } - else { + rewriter.replaceOpWithNewOp( + op.getOperation(), ty, IntegerAttr::get(ty, op.getValue().cast().getValue())); + } else { rewriter.replaceOpWithNewOp(op.getOperation(), op.getValue()); } #endif @@ -188,79 +166,62 @@ class ConstantOpLowering : public OpConversionPattern } }; -class CallKernelOpLowering : public OpConversionPattern -{ - - static std::vector getLLVMInputOutputTypes(Location &loc, - MLIRContext *context, - TypeConverter *typeConverter, - TypeRange resultTypes, - TypeRange operandTypes, - bool hasVarRes, - Type indexType) - { +class CallKernelOpLowering : public OpConversionPattern { + + static std::vector getLLVMInputOutputTypes(Location &loc, MLIRContext *context, TypeConverter *typeConverter, + TypeRange resultTypes, TypeRange operandTypes, bool hasVarRes, + Type indexType) { llvm::SmallVector args; - + // -------------------------------------------------------------------- // Results // -------------------------------------------------------------------- - + const size_t numRes = resultTypes.size(); - if(hasVarRes) { // combine all results into one variadic result + if (hasVarRes) { // combine all results into one variadic result // TODO Support individual result types, at least if they are all // mapped to the superclass Structure (see #397). // Check if all results have the same type. Type t0 = resultTypes[0]; Type mt0 = t0.dyn_cast().withSameElementTypeAndRepr(); - for(size_t i = 1; i < numRes; i++) - if (mt0 != resultTypes[i] - .dyn_cast() - .withSameElementTypeAndRepr()) { - throw ErrorHandler::compilerError( - loc, "LowerToLLVMPass", - "all results of a CallKernelOp must have the same " - "type to combine them into a single variadic result"); + for (size_t i = 1; i < numRes; i++) + if (mt0 != resultTypes[i].dyn_cast().withSameElementTypeAndRepr()) { + throw ErrorHandler::compilerError(loc, "LowerToLLVMPass", + "all results of a CallKernelOp must have the same " + "type to combine them into a single variadic result"); } // Wrap the common result type into a pointer, since we need an // array of that type. - args.push_back(LLVM::LLVMPointerType::get( - typeConverter->isLegal(t0) - ? t0 - : typeConverter->convertType(t0) - )); - } - else // typical case + args.push_back( + LLVM::LLVMPointerType::get(typeConverter->isLegal(t0) ? t0 : typeConverter->convertType(t0))); + } else // typical case for (auto type : resultTypes) { if (typeConverter->isLegal(type)) { args.push_back(type); - } - else if (failed(typeConverter->convertType(type, args))) + } else if (failed(typeConverter->convertType(type, args))) emitError(loc) << "Couldn't convert result type `" << type << "`\n"; } - + // -------------------------------------------------------------------- // Operands // -------------------------------------------------------------------- - - if(hasVarRes) + + if (hasVarRes) // Create a parameter for passing the number of results in the // single variadic result. - args.push_back(typeConverter->isLegal(indexType) - ? indexType - : typeConverter->convertType(indexType)); - + args.push_back(typeConverter->isLegal(indexType) ? 
indexType : typeConverter->convertType(indexType));
+
         for (auto type : operandTypes) {
             if (typeConverter->isLegal(type)) {
                 args.push_back(type);
-            }
-            else if (failed(typeConverter->convertType(type, args)))
+            } else if (failed(typeConverter->convertType(type, args)))
                 emitError(loc) << "Couldn't convert operand type `" << type << "`\n";
         }

         // --------------------------------------------------------------------
         // Create final LLVM types
         // --------------------------------------------------------------------
-
+
         std::vector argsLLVM;
         for (size_t i = 0; i < args.size(); i++) {
             Type type = args[i];
@@ -271,18 +232,15 @@ class CallKernelOpLowering : public OpConversionPattern
             if (!hasVarRes && i < numRes) {
                 type = LLVM::LLVMPointerType::get(type);
             }
-
+
             argsLLVM.push_back(type);
         }
-
+
         return argsLLVM;
     }

-    static FlatSymbolRefAttr
-    getOrInsertFunctionAttr(OpBuilder &rewriter, ModuleOp module,
-                            llvm::StringRef funcName,
-                            LLVM::LLVMFunctionType llvmFnType)
-    {
+    static FlatSymbolRefAttr getOrInsertFunctionAttr(OpBuilder &rewriter, ModuleOp module, llvm::StringRef funcName,
+                                                     LLVM::LLVMFunctionType llvmFnType) {
         auto *context = module.getContext();
         if (module.lookupSymbol(funcName))
             return SymbolRefAttr::get(context, funcName);
@@ -293,86 +251,61 @@ class CallKernelOpLowering : public OpConversionPattern
         return SymbolRefAttr::get(context, funcName);
     }

-    static LLVM::LLVMFunctionType
-    getKernelFuncSignature(MLIRContext *context, std::vector argsLLVM)
-    {
-        return LLVM::LLVMFunctionType::get(LLVM::LLVMVoidType::get(context), argsLLVM,
-                                           false);
+    static LLVM::LLVMFunctionType getKernelFuncSignature(MLIRContext *context, std::vector argsLLVM) {
+        return LLVM::LLVMFunctionType::get(LLVM::LLVMVoidType::get(context), argsLLVM, false);
     }

-public:
+  public:
     using OpConversionPattern::OpConversionPattern;

-    LogicalResult
-    matchAndRewrite(daphne::CallKernelOp op, OpAdaptor adaptor,
-                    ConversionPatternRewriter &rewriter) const override
-    {
+    LogicalResult matchAndRewrite(daphne::CallKernelOp op, OpAdaptor adaptor,
+                                  ConversionPatternRewriter &rewriter) const override {
         // Whether all results of the operation shall be combined into one
         // variadic result. If this is false (typical case), we pass a
         // separate nullptr for each result to the kernel. If it is true, we
         // create an array with the number of results, fill it with nullptrs,
         // and pass that to the kernel (variadic results).
         const bool hasVarRes = op->hasAttr(ATTR_HASVARIADICRESULTS)
-                                   ? op->getAttr(ATTR_HASVARIADICRESULTS).dyn_cast().getValue()
-                                   : false;
-
+                                   ? op->getAttr(ATTR_HASVARIADICRESULTS).dyn_cast().getValue()
+                                   : false;
+
         auto module = op->getParentOfType();
         auto loc = op.getLoc();

-        auto inputOutputTypes = getLLVMInputOutputTypes(
-            loc, rewriter.getContext(), typeConverter, op.getResultTypes(),
-            ValueRange(adaptor.getOperands()).getTypes(), hasVarRes,
-            rewriter.getIndexType());
+        auto inputOutputTypes =
+            getLLVMInputOutputTypes(loc, rewriter.getContext(), typeConverter, op.getResultTypes(),
+                                    ValueRange(adaptor.getOperands()).getTypes(), hasVarRes, rewriter.getIndexType());

         // create function prototype and get `FlatSymbolRefAttr` to it
-        auto kernelRef = getOrInsertFunctionAttr(
-            rewriter, module, op.getCalleeAttr().getValue(),
-            getKernelFuncSignature(rewriter.getContext(), inputOutputTypes));
+        auto kernelRef = getOrInsertFunctionAttr(rewriter, module, op.getCalleeAttr().getValue(),
+                                                 getKernelFuncSignature(rewriter.getContext(), inputOutputTypes));

-        auto kernelOperands = allocOutputReferences(
-            loc, rewriter, adaptor.getOperands(), inputOutputTypes,
-            op->getNumResults(), hasVarRes, op);
+        auto kernelOperands = allocOutputReferences(loc, rewriter, adaptor.getOperands(), inputOutputTypes,
+                                                    op->getNumResults(), hasVarRes, op);

         // call function
         // The kernel call has an empty list of return types, because our
         // kernel(-wrapper)s generally return via parameters.
         TypeRange ts;
-        rewriter.create(
-            loc, kernelRef,
-            ts,
-            kernelOperands);
-        rewriter.replaceOp(op, dereferenceOutputs(loc, rewriter, module,
-                                                  op->getNumResults(),
-                                                  hasVarRes, kernelOperands));
+        rewriter.create(loc, kernelRef, ts, kernelOperands);
+        rewriter.replaceOp(op,
+                           dereferenceOutputs(loc, rewriter, module, op->getNumResults(), hasVarRes, kernelOperands));
         return success();
     }

-private:
-
-    static std::vector
-    dereferenceOutputs(Location &loc, PatternRewriter &rewriter, ModuleOp &module,
-                       size_t numResults, bool hasVarRes, std::vector kernelOperands)
-    {
+  private:
+    static std::vector dereferenceOutputs(Location &loc, PatternRewriter &rewriter, ModuleOp &module,
+                                          size_t numResults, bool hasVarRes, std::vector kernelOperands) {
         // transformed results
         std::vector results;
-
-        if(hasVarRes) { // combine all results into one variadic result
-            for(size_t i = 0; i < numResults; i++) {
-                std::vector indices = {
-                    rewriter.create(loc, rewriter.getI64IntegerAttr(i))
-                };
+
+        if (hasVarRes) { // combine all results into one variadic result
+            for (size_t i = 0; i < numResults; i++) {
+                std::vector indices = {rewriter.create(loc, rewriter.getI64IntegerAttr(i))};
                 results.push_back(rewriter.create(
-                    loc,
-                    rewriter.create(
-                        loc,
-                        kernelOperands[0].getType(),
-                        kernelOperands[0],
-                        indices
-                    )
-                ));
+                    loc, rewriter.create(loc, kernelOperands[0].getType(), kernelOperands[0], indices)));
             }
-        }
-        else // typical case
+        } else // typical case
             for (size_t i = 0; i < numResults; i++) {
                 // dereference output
                 auto value = kernelOperands[i];
@@ -381,23 +314,20 @@ class CallKernelOpLowering : public OpConversionPattern
                 results.push_back(resultVal);
             }
-
+
         return results;
     }

-    std::vector
-    allocOutputReferences(Location &loc, PatternRewriter &rewriter,
-                          ValueRange operands,
-                          std::vector inputOutputTypes, size_t numRes, bool hasVarRes,
-                          daphne::CallKernelOp op) const
-    {
+    std::vector allocOutputReferences(Location &loc, PatternRewriter &rewriter, ValueRange operands,
+                                      std::vector inputOutputTypes, size_t numRes, bool hasVarRes,
+                                      daphne::CallKernelOp op) const {
         std::vector kernelOperands;
-
-        // Obtain an insertion point at the beginning of the function surrounding this
CallKernelOp - // (see comment on AllocaOp above). + + // Obtain an insertion point at the beginning of the function + // surrounding this CallKernelOp (see comment on AllocaOp above). OpBuilder::InsertPoint ipHere = rewriter.saveInsertionPoint(); - Block & fb = op.getOperation()->getParentOfType().getBody().front(); + Block &fb = op.getOperation()->getParentOfType().getBody().front(); rewriter.setInsertionPointToStart(&fb); OpBuilder::InsertPoint ipFuncStart = rewriter.saveInsertionPoint(); rewriter.restoreInsertionPoint(ipHere); @@ -405,18 +335,17 @@ class CallKernelOpLowering : public OpConversionPattern // -------------------------------------------------------------------- // Results // -------------------------------------------------------------------- - - if(hasVarRes) { // combine all results into one variadic result + + if (hasVarRes) { // combine all results into one variadic result // Allocate an array of numRes elements. - // Set the insertion point to the beginning of the function (see comment on AllocaOp above). + // Set the insertion point to the beginning of the function (see + // comment on AllocaOp above). ipHere = rewriter.saveInsertionPoint(); rewriter.restoreInsertionPoint(ipFuncStart); auto allocaOp = rewriter.create( - loc, - inputOutputTypes[0], - rewriter.create(loc, rewriter.getI64IntegerAttr(numRes)).getResult() - ); + loc, inputOutputTypes[0], + rewriter.create(loc, rewriter.getI64IntegerAttr(numRes)).getResult()); ipFuncStart = rewriter.saveInsertionPoint(); // Go back to the original insertion point. @@ -430,24 +359,19 @@ class CallKernelOpLowering : public OpConversionPattern // (i.e. when it represents a scalar), initialization is not // required. Type elType = inputOutputTypes[0].dyn_cast().getElementType(); - if(llvm::isa(elType)) { - for(size_t i = 0; i < numRes; i++) { + if (llvm::isa(elType)) { + for (size_t i = 0; i < numRes; i++) { std::vector indices = { - rewriter.create(loc, rewriter.getI64IntegerAttr(i)) - }; + rewriter.create(loc, rewriter.getI64IntegerAttr(i))}; rewriter.create( - loc, - rewriter.create(loc, elType), - rewriter.create( - loc, inputOutputTypes[0], allocaOp, indices - ) - ); + loc, rewriter.create(loc, elType), + rewriter.create(loc, inputOutputTypes[0], allocaOp, indices)); } } - } - else { // typical case + } else { // typical case // Constant of 1 for AllocaOp of output. - // Set the insertion point to the beginning of the function (see comment on AllocaOp above). + // Set the insertion point to the beginning of the function (see + // comment on AllocaOp above). ipHere = rewriter.saveInsertionPoint(); rewriter.restoreInsertionPoint(ipFuncStart); Value cst1 = rewriter.create(loc, rewriter.getI64IntegerAttr(1)); @@ -455,10 +379,11 @@ class CallKernelOpLowering : public OpConversionPattern // Go back to the original insertion point. rewriter.restoreInsertionPoint(ipHere); - + for (size_t i = 0; i < numRes; i++) { // Allocate space for a single element. - // Set the insertion point to the beginning of the function (see comment on AllocaOp above). + // Set the insertion point to the beginning of the function (see + // comment on AllocaOp above). ipHere = rewriter.saveInsertionPoint(); rewriter.restoreInsertionPoint(ipFuncStart); auto allocaOp = rewriter.create(loc, inputOutputTypes[i], cst1); @@ -468,33 +393,30 @@ class CallKernelOpLowering : public OpConversionPattern // Go back to the original insertion point. rewriter.restoreInsertionPoint(ipHere); - // If the type of this result parameter is a pointer (i.e. 
when it - // represents a matrix or frame), then initialize the allocated - // element with a null pointer (required by the kernels). Otherwise - // (i.e. when it represents a scalar), initialization is not - // required. + // If the type of this result parameter is a pointer (i.e. when + // it represents a matrix or frame), then initialize the + // allocated element with a null pointer (required by the + // kernels). Otherwise (i.e. when it represents a scalar), + // initialization is not required. Type elType = inputOutputTypes[i].dyn_cast().getElementType(); - if(llvm::isa(elType)) { - rewriter.create( - loc, - rewriter.create(loc, elType), - allocaOp - ); + if (llvm::isa(elType)) { + rewriter.create(loc, rewriter.create(loc, elType), allocaOp); } } } - + // -------------------------------------------------------------------- // Operands // -------------------------------------------------------------------- - - if(hasVarRes) - // Insert the number of results in the variadic result as a constant. + + if (hasVarRes) + // Insert the number of results in the variadic result as a + // constant. kernelOperands.push_back(rewriter.create(loc, rewriter.getIndexAttr(numRes))); - - for(auto op : operands) + + for (auto op : operands) kernelOperands.push_back(op); - + return kernelOperands; } }; @@ -503,28 +425,22 @@ class CallKernelOpLowering : public OpConversionPattern * @brief Rewrites `daphne::CreateVariadicPackOp` to `LLVM::AllocaOp` to create * an array for the required number of occurrences of a variadic operand. */ -class CreateVariadicPackOpLowering : public OpConversionPattern -{ -public: +class CreateVariadicPackOpLowering : public OpConversionPattern { + public: using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(daphne::CreateVariadicPackOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override - { - // Set the insertion point to the beginning of the function surrounding this CreateVariadicPackOp - // (see comment on AllocaOp above). - Block & fb = op.getOperation()->getParentOfType().getBody().front(); + LogicalResult matchAndRewrite(daphne::CreateVariadicPackOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Set the insertion point to the beginning of the function surrounding + // this CreateVariadicPackOp (see comment on AllocaOp above). 
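// ---------------------------------------------------------------------------
// For illustration (not part of the patch): what the variadic-pack lowerings
// here amount to at the LLVM level. CreateVariadicPackOp becomes a stack
// array (AllocaOp), and StoreVariadicPackOp writes one element through a GEP.
// Minimal sketch assuming the typed-pointer LLVM dialect used elsewhere in
// this pass; the helper name `storePackElement` is hypothetical, and
// `elemPtrTy` is the LLVM pointer type of the pack's element type.
static void storePackElement(mlir::OpBuilder &b, mlir::Location loc, mlir::Value pack, mlir::Value item, int64_t pos,
                             mlir::Type elemPtrTy) {
    // addr = &pack[pos]
    mlir::Value idx = b.create<mlir::LLVM::ConstantOp>(loc, b.getI64Type(), b.getI64IntegerAttr(pos));
    mlir::Value addr = b.create<mlir::LLVM::GEPOp>(loc, elemPtrTy, pack, mlir::ValueRange{idx});
    // *addr = item
    b.create<mlir::LLVM::StoreOp>(loc, item, addr);
}
// ---------------------------------------------------------------------------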
+ Block &fb = op.getOperation()->getParentOfType().getBody().front(); rewriter.setInsertionPointToStart(&fb); Type contType = op.getRes().getType().dyn_cast().getContainedType(); Type convType = typeConverter->convertType(contType); rewriter.replaceOpWithNewOp( - op.getOperation(), - LLVM::LLVMPointerType::get(convType), - rewriter.create(op->getLoc(), op.getNumElementsAttr()), - 1 - ); + op.getOperation(), LLVM::LLVMPointerType::get(convType), + rewriter.create(op->getLoc(), op.getNumElementsAttr()), 1); return success(); } }; @@ -534,65 +450,49 @@ class CreateVariadicPackOpLowering : public OpConversionPattern -{ -public: +class StoreVariadicPackOpLowering : public OpConversionPattern { + public: using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(daphne::StoreVariadicPackOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override - { + LogicalResult matchAndRewrite(daphne::StoreVariadicPackOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { mlir::Location loc = op->getLoc(); mlir::Value pack = adaptor.getOperands()[0]; mlir::Value item = adaptor.getOperands()[1]; auto elementType = pack.getType().cast().getElementType(); - std::vector indices = { - rewriter.create(loc, op.getPosAttr()) - }; - auto addr = rewriter.create( - loc, pack.getType(), pack, indices - ); + std::vector indices = {rewriter.create(loc, op.getPosAttr())}; + auto addr = rewriter.create(loc, pack.getType(), pack, indices); Type itemType = item.getType(); if (itemType != elementType) { if (llvm::isa(elementType)) { - if(itemType.isSignedInteger()) + if (itemType.isSignedInteger()) item = rewriter.create(loc, rewriter.getI64Type(), item); - else if(itemType.isUnsignedInteger() || itemType.isSignlessInteger()) + else if (itemType.isUnsignedInteger() || itemType.isSignlessInteger()) item = rewriter.create(loc, rewriter.getI64Type(), item); - else if(llvm::isa(itemType)) { + else if (llvm::isa(itemType)) { item = rewriter.create(loc, rewriter.getF64Type(), item); item = rewriter.create(loc, rewriter.getI64Type(), item); } else { - throw ErrorHandler::compilerError( - loc, "LowerToLLVMPass", - "itemType is an unsupported type"); + throw ErrorHandler::compilerError(loc, "LowerToLLVMPass", "itemType is an unsupported type"); } item = rewriter.create(loc, elementType, item); - } - else { + } else { throw ErrorHandler::compilerError(loc, "LowerToLLVMPass", - "casting to a non-pointer type in " - "StoreVariadicPackOpLowering is not implemented yet" - ); + "casting to a non-pointer type in " + "StoreVariadicPackOpLowering is not implemented yet"); } } - rewriter.replaceOpWithNewOp( - op.getOperation(), item, addr - ); + rewriter.replaceOpWithNewOp(op.getOperation(), item, addr); return success(); } }; -class MapOpLowering : public OpConversionPattern -{ -public: +class MapOpLowering : public OpConversionPattern { + public: using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(daphne::MapOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override - { + LogicalResult matchAndRewrite(daphne::MapOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { auto loc = op->getLoc(); auto module = op->getParentOfType(); @@ -605,42 +505,34 @@ class MapOpLowering : public OpConversionPattern // Input Matrix callee << "__" << CompilerUtils::mlirTypeToCppTypeName(op.getArg().getType(), false); - // Pointer to UDF + // Pointer to UDF callee << "__void"; - - // get pointer to UDF + // get pointer 
to UDF LLVM::LLVMFuncOp udfFuncOp = module.lookupSymbol(op.getFunc()); auto udfFnPtr = rewriter.create(loc, udfFuncOp); std::vector kernelOperands{op.getArg(), udfFnPtr}; - auto kernel = rewriter.create( - loc, - callee.str(), - kernelOperands, - op->getResultTypes() - ); + auto kernel = rewriter.create(loc, callee.str(), kernelOperands, op->getResultTypes()); rewriter.replaceOp(op, kernel.getResults()); return success(); } }; -class VectorizedPipelineOpLowering : public OpConversionPattern -{ - const DaphneUserConfig& cfg; +class VectorizedPipelineOpLowering : public OpConversionPattern { + const DaphneUserConfig &cfg; -public: - explicit VectorizedPipelineOpLowering(TypeConverter &typeConverter, MLIRContext *context, const DaphneUserConfig &cfg) - : OpConversionPattern(typeConverter, context), cfg(cfg) {} + public: + explicit VectorizedPipelineOpLowering(TypeConverter &typeConverter, MLIRContext *context, + const DaphneUserConfig &cfg) + : OpConversionPattern(typeConverter, context), cfg(cfg) {} using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(daphne::VectorizedPipelineOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override - { + LogicalResult matchAndRewrite(daphne::VectorizedPipelineOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { if (op.getCtx() == nullptr) { op->emitOpError() << "`DaphneContext` not known"; return failure(); @@ -658,7 +550,7 @@ class VectorizedPipelineOpLowering : public OpConversionPatterngetParentOfType(); - Block * moduleBody = moduleOp.getBody(); + Block *moduleBody = moduleOp.getBody(); rewriter.setInsertionPointToStart(moduleBody); static auto ix = 0; @@ -666,7 +558,8 @@ class VectorizedPipelineOpLowering : public OpConversionPattern(loc, funcName, funcType); fOp.getBody().takeBody(op.getBody()); @@ -675,35 +568,32 @@ class VectorizedPipelineOpLowering : public OpConversionPattern()) { callKernelOp.setOperand(callKernelOp.getNumOperands() - 1, daphneContext); } - // Extract inputs from array containing them and remove the block arguments matching the old inputs of the - // `VectorizedPipelineOp` + // Extract inputs from array containing them and remove the block + // arguments matching the old inputs of the `VectorizedPipelineOp` rewriter.setInsertionPointToStart(&funcBlock); - for(auto i = 0u; i < numDataOperands; ++i) { - auto addr = rewriter.create(loc, - ptrPtrI1Ty, - inputsArg, - ArrayRef({ - rewriter.create(loc, rewriter.getI64IntegerAttr(i))})); + for (auto i = 0u; i < numDataOperands; ++i) { + auto addr = rewriter.create( + loc, ptrPtrI1Ty, inputsArg, + ArrayRef({rewriter.create(loc, rewriter.getI64IntegerAttr(i))})); Value val = rewriter.create(loc, addr); auto expTy = typeConverter->convertType(op.getInputs().getType()[i]); if (expTy != val.getType()) { // casting for scalars val = rewriter.create(loc, rewriter.getI64Type(), val); - if(llvm::isa(expTy)) + if (llvm::isa(expTy)) val = rewriter.create(loc, expTy, val); - else if(llvm::isa(expTy)) { + else if (llvm::isa(expTy)) { val = rewriter.create(loc, rewriter.getF64Type(), val); val = rewriter.create(loc, expTy, val); } else { - throw ErrorHandler::compilerError( - loc, "LowerToLLVMPass", - "expTy is an unsupported type"); + throw ErrorHandler::compilerError(loc, "LowerToLLVMPass", "expTy is an unsupported type"); } } funcBlock.getArgument(0).replaceAllUsesWith(val); @@ -715,11 +605,14 @@ class VectorizedPipelineOpLowering : public OpConversionPatterngetNumOperands(); ++i) { auto retVal = 
oldReturn->getOperand(i); - // TODO: check how the GEPOp works exactly, and if this can be written better - auto addr1 = rewriter.create(op->getLoc(), pppI1Ty, returnRef, ArrayRef( - {rewriter.create(loc, rewriter.getI64IntegerAttr(i))})); + // TODO: check how the GEPOp works exactly, and if this can be + // written better + auto addr1 = rewriter.create( + op->getLoc(), pppI1Ty, returnRef, + ArrayRef({rewriter.create(loc, rewriter.getI64IntegerAttr(i))})); auto addr2 = rewriter.create(op->getLoc(), addr1); - Value retValConverted = typeConverter->materializeTargetConversion(rewriter, oldReturn->getLoc(), typeConverter->convertType(retVal.getType()), {retVal}); + Value retValConverted = typeConverter->materializeTargetConversion( + rewriter, oldReturn->getLoc(), typeConverter->convertType(retVal.getType()), {retVal}); rewriter.create(loc, retValConverted, addr2); } // Replace the old ReturnOp with operands by a new ReturnOp without @@ -731,19 +624,20 @@ class VectorizedPipelineOpLowering : public OpConversionPatterngetParentOfType(); - Block * moduleBody = moduleOp.getBody(); + Block *moduleBody = moduleOp.getBody(); rewriter.setInsertionPointToStart(moduleBody); static auto ix = 0; std::string funcName = "_vect_cuda" + std::to_string(++ix); auto funcType = LLVM::LLVMFunctionType::get(LLVM::LLVMVoidType::get(rewriter.getContext()), - {/*outputs...*/pppI1Ty, /*inputs...*/ ptrPtrI1Ty, /*daphneContext...*/ptrI1Ty}); + {/*outputs...*/ pppI1Ty, /*inputs...*/ ptrPtrI1Ty, + /*daphneContext...*/ ptrI1Ty}); fOp2 = rewriter.create(loc, funcName, funcType); fOp2.getBody().takeBody(op.getCuda()); @@ -753,42 +647,50 @@ class VectorizedPipelineOpLowering : public OpConversionPattern()) { + // TODO: we should not create a new daphneContext, instead pass + // the one created in the main function + for (auto callKernelOp : funcBlock.getOps()) { callKernelOp.setOperand(callKernelOp.getNumOperands() - 1, daphneContext); } - // Extract inputs from array containing them and remove the block arguments matching the old inputs of the + // Extract inputs from array containing them and remove the + // block arguments matching the old inputs of the // `VectorizedPipelineOp` rewriter.setInsertionPointToStart(&funcBlock); for (auto i = 0u; i < numDataOperands; ++i) { - auto addr = rewriter.create(loc, ptrPtrI1Ty, inputsArg, ArrayRef({ - rewriter.create(loc, rewriter.getI64IntegerAttr(i))})); + auto addr = rewriter.create( + loc, ptrPtrI1Ty, inputsArg, + ArrayRef({rewriter.create(loc, rewriter.getI64IntegerAttr(i))})); Value val = rewriter.create(loc, addr); auto expTy = typeConverter->convertType(op.getInputs().getType()[i]); if (expTy != val.getType()) { - val = rewriter.create(loc, rewriter.getIntegerType(expTy.getIntOrFloatBitWidth(),false), val); + val = rewriter.create( + loc, rewriter.getIntegerType(expTy.getIntOrFloatBitWidth(), false), val); val = rewriter.create(loc, expTy, val); } funcBlock.getArgument(0).replaceAllUsesWith(val); funcBlock.eraseArgument(0); } - // Update function block to write return value by reference instead + // Update function block to write return value by reference + // instead auto oldReturn = funcBlock.getTerminator(); rewriter.setInsertionPoint(oldReturn); for (auto i = 0u; i < oldReturn->getNumOperands(); ++i) { auto retVal = oldReturn->getOperand(i); - // TODO: check how the GEPOp works exactly, and if this can be written better - auto addr1 = rewriter.create(op->getLoc(), pppI1Ty, returnRef, ArrayRef( - {rewriter.create(loc, rewriter.getI64IntegerAttr(i))})); + // TODO: check how 
the GEPOp works exactly, and if this can + // be written better + auto addr1 = rewriter.create( + op->getLoc(), pppI1Ty, returnRef, + ArrayRef({rewriter.create(loc, rewriter.getI64IntegerAttr(i))})); auto addr2 = rewriter.create(op->getLoc(), addr1); - Value retValConverted = typeConverter->materializeTargetConversion(rewriter, oldReturn->getLoc(), typeConverter->convertType(retVal.getType()), {retVal}); + Value retValConverted = typeConverter->materializeTargetConversion( + rewriter, oldReturn->getLoc(), typeConverter->convertType(retVal.getType()), {retVal}); rewriter.create(loc, retValConverted, addr2); } - // Replace the old ReturnOp with operands by a new ReturnOp without - // operands. + // Replace the old ReturnOp with operands by a new ReturnOp + // without operands. rewriter.replaceOpWithNewOp(oldReturn); } @@ -803,95 +705,84 @@ class VectorizedPipelineOpLowering : public OpConversionPatterngetResultTypes(); const size_t numRes = op->getNumResults(); - if(numRes > 0) { + if (numRes > 0) { // TODO Support individual types for all outputs (see #397). // Check if all results have the same type. Type mt0 = resultTypes[0].dyn_cast().withSameElementTypeAndRepr(); for (size_t i = 1; i < numRes; i++) { - if (mt0 != resultTypes[i] - .dyn_cast() - .withSameElementTypeAndRepr()) { - throw ErrorHandler::compilerError( - op, "LowerToLLVMPass", - "encountered a vectorized pipelines with different " - "result types, but at the moment we require all " - "results to have the same type"); + if (mt0 != resultTypes[i].dyn_cast().withSameElementTypeAndRepr()) { + throw ErrorHandler::compilerError(op, "LowerToLLVMPass", + "encountered a vectorized pipelines with different " + "result types, but at the moment we require all " + "results to have the same type"); } } - // Append the name of the common type of all results to the kernel name. + // Append the name of the common type of all results to the kernel + // name. callee << "__" << CompilerUtils::mlirTypeToCppTypeName(resultTypes[0], false) << "_variadic__size_t"; } mlir::Type operandType; std::vector newOperands; - if(numRes > 0) { + if (numRes > 0) { auto m32type = rewriter.getF32Type(); auto m64type = rewriter.getF64Type(); auto msi64type = rewriter.getIntegerType(64, true); auto res_elem_type = op->getResult(0).getType().dyn_cast().getElementType(); - if(res_elem_type == m64type) + if (res_elem_type == m64type) operandType = daphne::MatrixType::get(getContext(), m64type); - else if(res_elem_type == m32type) + else if (res_elem_type == m32type) operandType = daphne::MatrixType::get(getContext(), m32type); - else if(res_elem_type == msi64type) + else if (res_elem_type == msi64type) operandType = daphne::MatrixType::get(getContext(), msi64type); else { std::string str; llvm::raw_string_ostream output(str); op->getResult(0).getType().print(output); - throw ErrorHandler::compilerError( - op, "LowerToLLVMPass", - "Unsupported result type for vectorizedPipeline op: " + - str); + throw ErrorHandler::compilerError(op, "LowerToLLVMPass", + "Unsupported result type for vectorizedPipeline op: " + str); } - } - else { - throw ErrorHandler::compilerError( - op, "LowerToLLVMPass", - "vectorizedPipelineOp without outputs not supported at the " - "moment!"); + } else { + throw ErrorHandler::compilerError(op, "LowerToLLVMPass", + "vectorizedPipelineOp without outputs not supported at the " + "moment!"); } // Handle variadic operands isScalar and inputs (both share numInputs). auto attrNumInputs = rewriter.getI64IntegerAttr(numDataOperands); // For isScalar. 
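// --- Illustration (not part of the patch): how the `callee` stream above
// assembles the kernel's mangled name. A fixed base name is extended by one
// type-derived suffix per result/operand group, in call order. The exact
// suffix spellings below are illustrative assumptions, not necessarily the
// strings CompilerUtils::mlirTypeToCppTypeName() produces.
#include <iostream>
#include <sstream>

int main() {
    std::stringstream callee;
    callee << "_vectorizedPipeline";                   // kernel base name (assumed)
    callee << "__DenseMatrix_double_variadic__size_t"; // common result type
    callee << "__bool";                                // isScalar pack
    callee << "__DenseMatrix_double_variadic__size_t"; // inputs pack + numInputs
    callee << "__size_t";                              // number of pipeline functions
    callee << "__void_variadic";                       // function-pointer array
    std::cout << callee.str() << "\n";
}
// --- end illustration ---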
callee << "__bool"; - auto vpScalar = rewriter.create(loc, - daphne::VariadicPackType::get(rewriter.getContext(), rewriter.getI1Type()), - attrNumInputs); + auto vpScalar = rewriter.create( + loc, daphne::VariadicPackType::get(rewriter.getContext(), rewriter.getI1Type()), attrNumInputs); // For inputs and numInputs. callee << "__" << CompilerUtils::mlirTypeToCppTypeName(operandType, false, true); callee << "_variadic__size_t"; - auto vpInputs = rewriter.create(loc, - daphne::VariadicPackType::get(rewriter.getContext(), operandType), - attrNumInputs); + auto vpInputs = rewriter.create( + loc, daphne::VariadicPackType::get(rewriter.getContext(), operandType), attrNumInputs); // Populate the variadic packs for isScalar and inputs. - for(size_t k = 0; k < numDataOperands; k++) { + for (size_t k = 0; k < numDataOperands; k++) { auto attrK = rewriter.getI64IntegerAttr(k); rewriter.create( + loc, vpScalar, + rewriter.create( loc, - vpScalar, - rewriter.create( - loc, - // We assume this input to be a scalar if its type - // has not been converted to a pointer type. - !llvm::isa(adaptor.getOperands()[k].getType()) - ), - attrK - ); - rewriter.create( - loc, vpInputs, adaptor.getOperands()[k], attrK - ); + // We assume this input to be a scalar if its type + // has not been converted to a pointer type. + !llvm::isa(adaptor.getOperands()[k].getType())), + attrK); + rewriter.create(loc, vpInputs, adaptor.getOperands()[k], attrK); } newOperands.push_back(vpScalar); newOperands.push_back(vpInputs); - newOperands.push_back(rewriter.create(loc, rewriter.getIndexType(), rewriter.getIndexAttr(numDataOperands))); + newOperands.push_back( + rewriter.create(loc, rewriter.getIndexType(), rewriter.getIndexAttr(numDataOperands))); - // Obtain an insertion point at the beginning of the function surrounding this VectorizedPipelineOp - // (see comment on AllocaOp above). + // Obtain an insertion point at the beginning of the function + // surrounding this VectorizedPipelineOp (see comment on AllocaOp + // above). 
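// --- Illustration (not part of the patch): what the CreateVariadicPackOp /
// StoreVariadicPackOp pair above amounts to after lowering: allocate a
// fixed-size pack, fill one slot per operand, then hand the raw array plus
// its length to a variadic kernel. All names here (VariadicPack, runKernel)
// are illustrative, not DAPHNE's actual runtime API.
#include <cstddef>
#include <cstdio>
#include <memory>

template <typename T> struct VariadicPack {
    std::unique_ptr<T[]> slots;
    std::size_t size;
    explicit VariadicPack(std::size_t n)                 // CreateVariadicPackOp
        : slots(new T[n]()), size(n) {}
    void store(std::size_t pos, T v) { slots[pos] = v; } // StoreVariadicPackOp
};

// A "kernel" receiving the pack as pointer + element count.
static void runKernel(const bool *isScalar, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i)
        std::printf("input %zu is a %s\n", i, isScalar[i] ? "scalar" : "matrix");
}

int main() {
    VariadicPack<bool> vpScalar(3); // numDataOperands = 3
    vpScalar.store(0, false);       // pointer-typed operand -> matrix
    vpScalar.store(1, true);        // non-pointer operand   -> scalar
    vpScalar.store(2, false);
    runKernel(vpScalar.slots.get(), vpScalar.size);
}
// --- end illustration ---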
OpBuilder::InsertPoint ipHere = rewriter.saveInsertionPoint(); - Block & fb = op.getOperation()->getParentOfType().getBody().front(); + Block &fb = op.getOperation()->getParentOfType().getBody().front(); rewriter.setInsertionPointToStart(&fb); OpBuilder::InsertPoint ipFuncStart = rewriter.saveInsertionPoint(); rewriter.restoreInsertionPoint(ipHere); @@ -900,16 +791,17 @@ class VectorizedPipelineOpLowering : public OpConversionPattern splitConsts; - for(auto split : op.getSplits()) { + for (auto split : op.getSplits()) { splitConsts.push_back(rewriter.create(loc, split)); } newOperands.push_back(convertToArray(loc, rewriter, rewriter.getI64Type(), splitConsts, ipFuncStart)); @@ -917,57 +809,53 @@ class VectorizedPipelineOpLowering : public OpConversionPattern combineConsts; - for(auto combine : op.getCombines()) { + for (auto combine : op.getCombines()) { combineConsts.push_back(rewriter.create(loc, combine)); } newOperands.push_back(convertToArray(loc, rewriter, rewriter.getI64Type(), combineConsts, ipFuncStart)); - // TODO: pass function pointer with special placeholder instead of `void` + // TODO: pass function pointer with special placeholder instead of + // `void` callee << "__size_t"; - newOperands.push_back(rewriter.create(loc, rewriter.getIndexType(), rewriter.getIndexAttr(func_ptrs.size()))); + newOperands.push_back( + rewriter.create(loc, rewriter.getIndexType(), rewriter.getIndexAttr(func_ptrs.size()))); callee << "__void_variadic"; newOperands.push_back(convertToArray(loc, rewriter, ptrPtrI1Ty, func_ptrs, ipFuncStart)); -// newOperands.push_back(fnPtr); + // newOperands.push_back(fnPtr); // Add ctx -// newOperands.push_back(operands.back()); + // newOperands.push_back(operands.back()); if (op.getCtx() == nullptr) { op->emitOpError() << "`DaphneContext` not known"; return failure(); - } - else + } else newOperands.push_back(op.getCtx()); // Create a CallKernelOp for the kernel function to call and return // success(). - auto kernel = rewriter.create( - loc, - callee.str(), - newOperands, - resultTypes - ); + auto kernel = rewriter.create(loc, callee.str(), newOperands, resultTypes); kernel->setAttr(ATTR_HASVARIADICRESULTS, rewriter.getBoolAttr(true)); rewriter.replaceOp(op, kernel.getResults()); return success(); } -private: - static Value convertToArray(Location loc, ConversionPatternRewriter &rewriter, Type valueTy, ValueRange values, OpBuilder::InsertPoint & ipFuncStart) - { - // Set the insertion point to the beginning of the function surrounding this VectorizedPipelineOp - // (see comment on AllocaOp above). + + private: + static Value convertToArray(Location loc, ConversionPatternRewriter &rewriter, Type valueTy, ValueRange values, + OpBuilder::InsertPoint &ipFuncStart) { + // Set the insertion point to the beginning of the function surrounding + // this VectorizedPipelineOp (see comment on AllocaOp above). OpBuilder::InsertPoint ipHere = rewriter.saveInsertionPoint(); rewriter.restoreInsertionPoint(ipFuncStart); auto valuePtrTy = LLVM::LLVMPointerType::get(valueTy); - auto array = rewriter.create(loc, - valuePtrTy, - Value(rewriter.create(loc, rewriter.getI64IntegerAttr(values.size())))); + auto array = rewriter.create( + loc, valuePtrTy, Value(rewriter.create(loc, rewriter.getI64IntegerAttr(values.size())))); ipFuncStart = rewriter.saveInsertionPoint(); // Go back to the original insertion point. 
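// --- Illustration (not part of the patch): the saveInsertionPoint() /
// restoreInsertionPoint() dance used here exists so the AllocaOp lands once
// in the surrounding function's entry region instead of inside a loop body.
// The toy Builder below mimics that cursor juggling; it is a sketch, not
// MLIR's OpBuilder.
#include <iostream>
#include <iterator>
#include <list>
#include <string>

struct Builder {
    std::list<std::string> func;         // the "IR" of one function
    std::list<std::string>::iterator ip; // ops are inserted before ip
    Builder() : ip(func.end()) {}
    void emit(const std::string &op) { func.insert(ip, op); }
};

int main() {
    Builder b;
    b.emit("entry:");
    b.emit("loop:");
    auto ipFuncStart = std::prev(b.func.end()); // inserts land after "entry:"
    b.emit("  store %v -> %array[i]");          // loop body uses %array
    auto ipHere = b.ip;                         // save the current position
    b.ip = ipFuncStart;
    b.emit("  %array = alloca 4 x i64");        // hoisted to the function start
    b.ip = ipHere;                              // go back to the loop body
    b.emit("  br loop");
    for (const auto &op : b.func)
        std::cout << op << "\n";
}
// --- end illustration ---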
rewriter.restoreInsertionPoint(ipHere); - for(auto i = 0u; i < values.size(); ++i) { + for (auto i = 0u; i < values.size(); ++i) { Value cstI = rewriter.create(loc, rewriter.getI64IntegerAttr(i)); auto addr = rewriter.create(loc, valuePtrTy, array, ArrayRef({cstI})); auto val = values[i]; @@ -980,38 +868,30 @@ class VectorizedPipelineOpLowering : public OpConversionPattern -{ -public: +class GenericCallOpLowering : public OpConversionPattern { + public: using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(daphne::GenericCallOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override - { + LogicalResult matchAndRewrite(daphne::GenericCallOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp(op, op.getCallee(), op->getResultTypes(), adaptor.getOperands()); return success(); } }; -namespace -{ - struct DaphneLowerToLLVMPass - : public PassWrapper> - { - explicit DaphneLowerToLLVMPass(const DaphneUserConfig& cfg) : cfg(cfg) { } - const DaphneUserConfig& cfg; +namespace { +struct DaphneLowerToLLVMPass : public PassWrapper> { + explicit DaphneLowerToLLVMPass(const DaphneUserConfig &cfg) : cfg(cfg) {} + const DaphneUserConfig &cfg; - void getDependentDialects(DialectRegistry & registry) const override - { - registry.insert(); - } - void runOnOperation() final; - }; + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() final; +}; } // end anonymous namespace -void DaphneLowerToLLVMPass::runOnOperation() -{ +void DaphneLowerToLLVMPass::runOnOperation() { auto module = getOperation(); RewritePatternSet patterns(&getContext()); @@ -1019,57 +899,27 @@ void DaphneLowerToLLVMPass::runOnOperation() LowerToLLVMOptions llvmOptions(&getContext()); // llvmOptions.useBarePtrCallConv = true; LLVMTypeConverter typeConverter(&getContext(), llvmOptions); - typeConverter.addConversion([&](daphne::MatrixType t) - { - return LLVM::LLVMPointerType::get( - IntegerType::get(t.getContext(), 1)); - }); - typeConverter.addConversion([&](daphne::FrameType t) - { - return LLVM::LLVMPointerType::get( - IntegerType::get(t.getContext(), 1)); - }); - typeConverter.addConversion([&](daphne::ListType t) - { - return LLVM::LLVMPointerType::get( - IntegerType::get(t.getContext(), 1)); - }); - typeConverter.addConversion([&](daphne::StringType t) - { - return LLVM::LLVMPointerType::get( - IntegerType::get(t.getContext(), 8)); - }); - typeConverter.addConversion([&](daphne::VariadicPackType t) - { - return LLVM::LLVMPointerType::get( - typeConverter.convertType(t.getContainedType()) - ); - }); - typeConverter.addConversion([&](daphne::DaphneContextType t) - { - return LLVM::LLVMPointerType::get( - IntegerType::get(t.getContext(), 1)); - }); - typeConverter.addConversion([&](daphne::HandleType t) - { - return LLVM::LLVMPointerType::get( - IntegerType::get(t.getContext(), 1)); - }); - typeConverter.addConversion([&](daphne::FileType t) - { - return LLVM::LLVMPointerType::get( - IntegerType::get(t.getContext(), 1)); - }); - typeConverter.addConversion([&](daphne::DescriptorType t) - { - return LLVM::LLVMPointerType::get( - IntegerType::get(t.getContext(), 1)); - }); - typeConverter.addConversion([&](daphne::TargetType t) - { - return LLVM::LLVMPointerType::get( - IntegerType::get(t.getContext(), 1)); + typeConverter.addConversion( + [&](daphne::MatrixType t) { return LLVM::LLVMPointerType::get(IntegerType::get(t.getContext(), 1)); }); + typeConverter.addConversion( + 
[&](daphne::FrameType t) { return LLVM::LLVMPointerType::get(IntegerType::get(t.getContext(), 1)); }); + typeConverter.addConversion( + [&](daphne::ListType t) { return LLVM::LLVMPointerType::get(IntegerType::get(t.getContext(), 1)); }); + typeConverter.addConversion( + [&](daphne::StringType t) { return LLVM::LLVMPointerType::get(IntegerType::get(t.getContext(), 8)); }); + typeConverter.addConversion([&](daphne::VariadicPackType t) { + return LLVM::LLVMPointerType::get(typeConverter.convertType(t.getContainedType())); }); + typeConverter.addConversion( + [&](daphne::DaphneContextType t) { return LLVM::LLVMPointerType::get(IntegerType::get(t.getContext(), 1)); }); + typeConverter.addConversion( + [&](daphne::HandleType t) { return LLVM::LLVMPointerType::get(IntegerType::get(t.getContext(), 1)); }); + typeConverter.addConversion( + [&](daphne::FileType t) { return LLVM::LLVMPointerType::get(IntegerType::get(t.getContext(), 1)); }); + typeConverter.addConversion( + [&](daphne::DescriptorType t) { return LLVM::LLVMPointerType::get(IntegerType::get(t.getContext(), 1)); }); + typeConverter.addConversion( + [&](daphne::TargetType t) { return LLVM::LLVMPointerType::get(IntegerType::get(t.getContext(), 1)); }); LLVMConversionTarget target(getContext()); @@ -1087,17 +937,11 @@ void DaphneLowerToLLVMPass::runOnOperation() // for trivial casts no lowering to kernels -> higher benefit patterns.insert(&getContext(), 2); - patterns.insert( - typeConverter, &getContext()); + patterns.insert(typeConverter, &getContext()); patterns.insert(typeConverter, &getContext(), cfg); - patterns.insert< - ConstantOpLowering, - ReturnOpLowering, - StoreVariadicPackOpLowering, - GenericCallOpLowering, - MapOpLowering - >(&getContext()); + patterns.insert(&getContext()); // We want to completely lower to LLVM, so we use a `FullConversion`. This // ensures that only legal operations will remain after the conversion. @@ -1105,7 +949,6 @@ void DaphneLowerToLLVMPass::runOnOperation() signalPassFailure(); } -std::unique_ptr daphne::createLowerToLLVMPass(const DaphneUserConfig& cfg) -{ +std::unique_ptr daphne::createLowerToLLVMPass(const DaphneUserConfig &cfg) { return std::make_unique(cfg); } diff --git a/src/compiler/lowering/ManageObjRefsPass.cpp b/src/compiler/lowering/ManageObjRefsPass.cpp index f0e89d6d7..fd9f8746e 100644 --- a/src/compiler/lowering/ManageObjRefsPass.cpp +++ b/src/compiler/lowering/ManageObjRefsPass.cpp @@ -15,10 +15,10 @@ */ #include -#include #include #include #include +#include #include #include @@ -46,8 +46,7 @@ using namespace mlir; * object that is still needed in a surrounding scope, i.e., to prevent * double frees. */ -struct ManageObjRefsPass : public PassWrapper> -{ +struct ManageObjRefsPass : public PassWrapper> { explicit ManageObjRefsPass() {} void runOnOperation() final; @@ -56,11 +55,10 @@ struct ManageObjRefsPass : public PassWrapper(v.getLoc(), - v.getDefiningOp()->getOperand(0)); + builder.create(v.getLoc(), v.getDefiningOp()->getOperand(0)); } /** @@ -76,49 +74,48 @@ void processValue(OpBuilder builder, Value v) { // We only need to manage the reference counters of DAPHNE data objects // like matrices and frames (not of scalars). - Operation* defOp = v.getDefiningOp(); + Operation *defOp = v.getDefiningOp(); if (defOp && llvm::isa(defOp)) processMemRefInterop(builder, v); // Increase the reference counter of string literals, such that they don't // get gargabe collected. - if(defOp && llvm::isa(defOp) && llvm::isa(v.getType())) { - // The given value is a string literal. 
We want to increase its reference - // counter right after its definition, such that it is never removed. - // But if the defining op is the block of a FuncOp, make sure not to insert the - // IncRefOp before the CreateDaphneContextOp, otherwise we will run - // into problems during/after lowering to kernel calls. - Block * pb = v.getParentBlock(); - if(auto fo = dyn_cast(pb->getParentOp())) { + if (defOp && llvm::isa(defOp) && llvm::isa(v.getType())) { + // The given value is a string literal. We want to increase its + // reference counter right after its definition, such that it is never + // removed. But if the defining op is the block of a FuncOp, make sure + // not to insert the IncRefOp before the CreateDaphneContextOp, + // otherwise we will run into problems during/after lowering to kernel + // calls. + Block *pb = v.getParentBlock(); + if (auto fo = dyn_cast(pb->getParentOp())) { Value dctx = CompilerUtils::getDaphneContext(fo); builder.setInsertionPointAfterValue(dctx); - } - else + } else builder.setInsertionPointAfter(defOp); builder.create(v.getLoc(), v); } - // Increase the reference counter of the result of the arith.select op, if it is - // a string scalar. - // This is necessary because for arith.select, we have no clue which of - // its two arguments (2nd or 3rd one) it will return. Unless we do something - // about it, the reference counter of the result will be too low by 1. - // Thus, we increase the result's reference counter here. - if(defOp && llvm::isa(defOp) && llvm::isa(v.getType())) { + // Increase the reference counter of the result of the arith.select op, if + // it is a string scalar. This is necessary because for arith.select, we + // have no clue which of its two arguments (2nd or 3rd one) it will return. + // Unless we do something about it, the reference counter of the result will + // be too low by 1. Thus, we increase the result's reference counter here. + if (defOp && llvm::isa(defOp) && llvm::isa(v.getType())) { builder.setInsertionPointAfter(defOp); builder.create(v.getLoc(), v); } - if (!llvm::isa(v.getType())) + if (!llvm::isa(v.getType())) return; - Operation* decRefAfterOp = nullptr; + Operation *decRefAfterOp = nullptr; if (v.use_empty()) { // If the given SSA value has no uses, we want to decrease its // reference counter directly after its definition (nullptr for block // args). Note that ideally, there should be no unused SSA values. - if (defOp) decRefAfterOp = defOp; + if (defOp) + decRefAfterOp = defOp; // else: decRefAfterOp stays nullptr } else { // If the given SSA value has uses, we need to find the last of them. @@ -135,11 +132,11 @@ void processValue(OpBuilder builder, Value v) { // At this point, decRefAfterOp is nullptr, or the last user of v, or the // defining op of v. - if(decRefAfterOp) { + if (decRefAfterOp) { // The given value is used and/or an OpResult. // Don't insert a DecRefOp if the last user is a terminator. - if(decRefAfterOp->hasTrait()) + if (decRefAfterOp->hasTrait()) // The value is handed out of its block (e.g., return, yield, ...). // So a new reference to it is created. Thus, the reference counter // must remain unchanged. Moreover, it is impossible to insert any @@ -150,23 +147,21 @@ void processValue(OpBuilder builder, Value v) { // Don't insert a DecRefOp if there is already one. 
Currently, this can // happen only on the distributed worker, since the IR it gets already // contains - if(llvm::isa(decRefAfterOp)) + if (llvm::isa(decRefAfterOp)) return; builder.setInsertionPointAfter(decRefAfterOp); - } - else { + } else { // The given value is an unused block arg. Decrease its reference // counter at the beginning of the block. // But if this is the block of a FuncOp, make sure not to insert the // DecRefOp before the CreateDaphneContextOp, otherwise we will run // into problems during/after lowering to kernel calls. - Block * pb = v.getParentBlock(); - if(auto fo = dyn_cast(pb->getParentOp())) { + Block *pb = v.getParentBlock(); + if (auto fo = dyn_cast(pb->getParentOp())) { Value dctx = CompilerUtils::getDaphneContext(fo); builder.setInsertionPointAfterValue(dctx); - } - else + } else builder.setInsertionPointToStart(pb); } @@ -183,15 +178,14 @@ void processValue(OpBuilder builder, Value v) { * @param v * @param b */ -void incRefIfObj(Value v, OpBuilder & b) { +void incRefIfObj(Value v, OpBuilder &b) { Type t = v.getType(); - if(llvm::isa(t)) + if (llvm::isa(t)) b.create(v.getLoc(), v); - else if(llvm::isa(t)) - throw ErrorHandler::compilerError( - v.getDefiningOp(), "ManageObjRefsPass", - "ManageObjRefsPass encountered a value of unknown type, so it " - "cannot know if it is a data object."); + else if (llvm::isa(t)) + throw ErrorHandler::compilerError(v.getDefiningOp(), "ManageObjRefsPass", + "ManageObjRefsPass encountered a value of unknown type, so it " + "cannot know if it is a data object."); } /** @@ -202,9 +196,9 @@ void incRefIfObj(Value v, OpBuilder & b) { * @param op * @param b */ -void incRefArgs(Operation& op, OpBuilder & b) { +void incRefArgs(Operation &op, OpBuilder &b) { b.setInsertionPoint(&op); - for(Value arg : op.getOperands()) + for (Value arg : op.getOperands()) incRefIfObj(arg, b); } @@ -215,46 +209,46 @@ void incRefArgs(Operation& op, OpBuilder & b) { * @param builder * @param b */ -void processBlock(OpBuilder builder, Block * b) { +void processBlock(OpBuilder builder, Block *b) { // Make sure that the reference counters of block arguments are decreased. - for(BlockArgument& arg : b->getArguments()) + for (BlockArgument &arg : b->getArguments()) processValue(builder, arg); // Make sure the reference counters of op results are decreased, and // Increase the reference counters of operands where necessary. - for(Operation& op : b->getOperations()) { + for (Operation &op : b->getOperations()) { // 1) Increase the reference counters of operands, if necessary. // TODO We could use traits to identify those cases. // Casts that will not call a kernel. - if(auto co = dyn_cast(op)) { - if(co.isTrivialCast() || co.isRemovePropertyCast()) + if (auto co = dyn_cast(op)) { + if (co.isTrivialCast() || co.isRemovePropertyCast()) incRefArgs(op, builder); } // Loops and function calls. - else if(llvm::isa(op)) + else if (llvm::isa(op)) incRefArgs(op, builder); // YieldOp of IfOp. - else if(llvm::isa(op) && llvm::isa(op.getParentOp())) { + else if (llvm::isa(op) && llvm::isa(op.getParentOp())) { // Increase the reference counters of data objects that already // existed before the IfOp, because yielding them creates a new // SSA value referring to them. builder.setInsertionPoint(&op); - for(Value arg : op.getOperands()) - if(arg.getParentBlock() != op.getBlock()) + for (Value arg : op.getOperands()) + if (arg.getParentBlock() != op.getBlock()) incRefIfObj(arg, builder); } // Terminators. 
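// --- Illustration (not part of the patch): the placement rule implemented
// by processValue() above in a nutshell: a DecRefOp goes right after the
// *last* user of a value, and is skipped entirely when that last user is a
// terminator, because the value then escapes the block. Toy model, not MLIR.
#include <cstdio>
#include <string>
#include <vector>

struct Op {
    std::string name;
    bool usesV;
    bool isTerminator;
};

int main() {
    std::vector<Op> block = {
        {"kernel.create", false, false}, // defines v
        {"kernel.use", true, false},
        {"kernel.other", false, false},
        {"kernel.use", true, false},     // last use of v
        {"return", false, true},
    };
    int decRefAfter = 0; // default: right after the definition
    for (int i = 0; i < (int)block.size(); ++i)
        if (block[i].usesV)
            decRefAfter = i;
    if (block[decRefAfter].isTerminator)
        std::puts("value escapes the block: leave the refcounter alone");
    else
        std::printf("insert DecRef(v) after op #%d (%s)\n", decRefAfter, block[decRefAfter].name.c_str());
}
// --- end illustration ---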
- else if(op.hasTrait()) { + else if (op.hasTrait()) { // By default, we do not decrease the reference counter of a // terminator's argument. If the same value is used multiple times // as an argument, we need to increase its reference counter. builder.setInsertionPoint(&op); - for(size_t i = 1; i < op.getNumOperands(); i++) { + for (size_t i = 1; i < op.getNumOperands(); i++) { Value arg = op.getOperand(i); - for(size_t k = 0; k < i; k++) - if(arg == op.getOperand(k)) + for (size_t k = 0; k < i; k++) + if (arg == op.getOperand(k)) incRefIfObj(arg, builder); } } @@ -263,27 +257,21 @@ void processBlock(OpBuilder builder, Block * b) { // of vectorized pipelines, because internally, a pipeline processes // views into its inputs. These are individual data objects. - // 2) Make sure the reference counters of op results are decreased. - for(Value v : op.getResults()) + for (Value v : op.getResults()) processValue(builder, v); - // 3) Recurse into the op, if it has regions. - for(Region& r : op.getRegions()) - for(Block& b2 : r.getBlocks()) + for (Region &r : op.getRegions()) + for (Block &b2 : r.getBlocks()) processBlock(builder, &b2); } } -void ManageObjRefsPass::runOnOperation() -{ +void ManageObjRefsPass::runOnOperation() { func::FuncOp f = getOperation(); OpBuilder builder(f.getContext()); processBlock(builder, &(f.getBody().front())); } -std::unique_ptr daphne::createManageObjRefsPass() -{ - return std::make_unique(); -} +std::unique_ptr daphne::createManageObjRefsPass() { return std::make_unique(); } diff --git a/src/compiler/lowering/MapOpLowering.cpp b/src/compiler/lowering/MapOpLowering.cpp index 27fff5dcc..fa6ac9016 100644 --- a/src/compiler/lowering/MapOpLowering.cpp +++ b/src/compiler/lowering/MapOpLowering.cpp @@ -32,33 +32,27 @@ using namespace mlir; -class InlineMapOpLowering - : public mlir::OpConversionPattern { - public: +class InlineMapOpLowering : public mlir::OpConversionPattern { + public: using OpConversionPattern::OpConversionPattern; - mlir::LogicalResult matchAndRewrite( - mlir::daphne::MapOp op, OpAdaptor adaptor, - mlir::ConversionPatternRewriter &rewriter) const override { + mlir::LogicalResult matchAndRewrite(mlir::daphne::MapOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { auto loc = op->getLoc(); - mlir::daphne::MatrixType lhsMatrixType = - op->getOperandTypes().front().dyn_cast(); + mlir::daphne::MatrixType lhsMatrixType = op->getOperandTypes().front().dyn_cast(); auto matrixElementType = lhsMatrixType.getElementType(); - auto lhsMemRefType = mlir::MemRefType::get( - {lhsMatrixType.getNumRows(), lhsMatrixType.getNumCols()}, matrixElementType); + auto lhsMemRefType = + mlir::MemRefType::get({lhsMatrixType.getNumRows(), lhsMatrixType.getNumCols()}, matrixElementType); mlir::Value lhs = - rewriter.create( - loc, lhsMemRefType, adaptor.getArg()); + rewriter.create(loc, lhsMemRefType, adaptor.getArg()); mlir::ModuleOp module = op->getParentOfType(); - func::FuncOp udfFuncOp = - module.lookupSymbol(op.getFunc()); + func::FuncOp udfFuncOp = module.lookupSymbol(op.getFunc()); SmallVector loopIvs; - auto outerLoop = - rewriter.create(loc, 0, lhsMatrixType.getNumRows(), 1); + auto outerLoop = rewriter.create(loc, 0, lhsMatrixType.getNumRows(), 1); for (Operation &nested : *outerLoop.getBody()) { rewriter.eraseOp(&nested); } @@ -66,8 +60,7 @@ class InlineMapOpLowering // outer loop body rewriter.setInsertionPointToStart(outerLoop.getBody()); - auto innerLoop = - rewriter.create(loc, 0, lhsMatrixType.getNumCols(), 1); + auto innerLoop = 
rewriter.create(loc, 0, lhsMatrixType.getNumCols(), 1); for (Operation &nested : *innerLoop.getBody()) { rewriter.eraseOp(&nested); } @@ -77,15 +70,12 @@ class InlineMapOpLowering // inner loop body mlir::Value lhsValue = rewriter.create(loc, lhs, loopIvs); - mlir::Value res = - rewriter.create(loc, udfFuncOp, ValueRange{lhsValue}) - ->getResult(0); + mlir::Value res = rewriter.create(loc, udfFuncOp, ValueRange{lhsValue})->getResult(0); rewriter.create(loc, res, lhs, loopIvs); rewriter.create(loc); rewriter.setInsertionPointAfter(outerLoop); - mlir::Value output = convertMemRefToDenseMatrix(op->getLoc(), rewriter, - lhs, op.getType()); + mlir::Value output = convertMemRefToDenseMatrix(op->getLoc(), rewriter, lhs, op.getType()); rewriter.replaceOp(op, output); return mlir::success(); } @@ -100,14 +90,11 @@ namespace { * This rewrite enables subsequent inlining pass to completely replace * the daphne::MapOp by inlining the produced CallOps from this pass. */ -struct MapOpLoweringPass - : public mlir::PassWrapper> { +struct MapOpLoweringPass : public mlir::PassWrapper> { explicit MapOpLoweringPass() {} void getDependentDialects(mlir::DialectRegistry ®istry) const override { - registry.insert(); } void runOnOperation() final; @@ -120,7 +107,7 @@ struct MapOpLoweringPass "UDF."; } }; -} // end anonymous namespace +} // end anonymous namespace void MapOpLoweringPass::runOnOperation() { mlir::ConversionTarget target(getContext()); @@ -128,8 +115,7 @@ void MapOpLoweringPass::runOnOperation() { mlir::LowerToLLVMOptions llvmOptions(&getContext()); mlir::LLVMTypeConverter typeConverter(&getContext(), llvmOptions); - target.addLegalDialect(); target.addIllegalOp(); @@ -141,6 +127,4 @@ void MapOpLoweringPass::runOnOperation() { } } -std::unique_ptr mlir::daphne::createMapOpLoweringPass() { - return std::make_unique(); -} +std::unique_ptr mlir::daphne::createMapOpLoweringPass() { return std::make_unique(); } diff --git a/src/compiler/lowering/MarkCUDAOpsPass.cpp b/src/compiler/lowering/MarkCUDAOpsPass.cpp index 96ea66f65..de89f2f16 100644 --- a/src/compiler/lowering/MarkCUDAOpsPass.cpp +++ b/src/compiler/lowering/MarkCUDAOpsPass.cpp @@ -24,76 +24,76 @@ using namespace mlir; struct MarkCUDAOpsPass : public PassWrapper> { - + /** * @brief User configuration influencing the rewrite pass */ - const DaphneUserConfig& cfg; + const DaphneUserConfig &cfg; size_t available_gpu_mem{}; size_t total_gpu_mem{}; size_t mem_budget; std::shared_ptr logger; - explicit MarkCUDAOpsPass(const DaphneUserConfig& cfg) : cfg(cfg) { + explicit MarkCUDAOpsPass(const DaphneUserConfig &cfg) : cfg(cfg) { // ToDo: use context and per device mem info cudaMemGetInfo(&available_gpu_mem, &total_gpu_mem); mem_budget = std::floor(0.9 * static_cast(total_gpu_mem)); logger = spdlog::get("compiler::cuda"); } - + void runOnOperation() final; - - void addCUDAOpsToVectorizedPipeline(OpBuilder& builder, daphne::VectorizedPipelineOp& pipelineOp) const { - - auto& pipeline = pipelineOp.getBody().front().getOperations(); + + void addCUDAOpsToVectorizedPipeline(OpBuilder &builder, daphne::VectorizedPipelineOp &pipelineOp) const { + + auto &pipeline = pipelineOp.getBody().front().getOperations(); bool build_cuda_pipeline; - - // add CUDA ops if at least one (cuda_fuse_any) or all (!cuda_fuse_any) ops would be supported - if(cfg.cuda_fuse_any) { - bool pipeline_has_supported_cuda_ops = llvm::any_of(pipeline, [&](Operation& o) { - return llvm::isa(o) || checkUseCUDA(&o); - }); + + // add CUDA ops if at least one (cuda_fuse_any) or all (!cuda_fuse_any) 
+ // ops would be supported + if (cfg.cuda_fuse_any) { + bool pipeline_has_supported_cuda_ops = llvm::any_of( + pipeline, [&](Operation &o) { return llvm::isa(o) || checkUseCUDA(&o); }); build_cuda_pipeline = pipeline_has_supported_cuda_ops; - } - else { - bool pipeline_has_unsupported_cuda_ops = llvm::any_of(pipeline, [&](Operation& o) { + } else { + bool pipeline_has_unsupported_cuda_ops = llvm::any_of(pipeline, [&](Operation &o) { if (!llvm::isa(o)) { bool out = checkUseCUDA(&o); logger->trace("checking pipeline op for cuda: {}: {}", o.getName().getStringRef().str(), out); return !out; - } - else return false; + } else + return false; }); build_cuda_pipeline = !pipeline_has_unsupported_cuda_ops; } - - // clone body region into cuda region if there's a cuda supported op in body - if(build_cuda_pipeline) { + + // clone body region into cuda region if there's a cuda supported op in + // body + if (build_cuda_pipeline) { PatternRewriter::InsertionGuard insertGuard(builder); IRMapping mapper; pipelineOp.getBody().cloneInto(&pipelineOp.getCuda(), mapper); - for (auto &op: pipelineOp.getCuda().front().getOperations()) { + for (auto &op : pipelineOp.getCuda().front().getOperations()) { bool isMat = CompilerUtils::isMatrixComputation(&op); if (op.hasTrait() && isMat) op.setAttr("cuda_device", builder.getI32IntegerAttr(0)); } } } - - bool fitsInMemory(mlir::Operation* op) const { + + bool fitsInMemory(mlir::Operation *op) const { auto opSize = 0ul; - for(auto operand : op->getOperands()) { + for (auto operand : op->getOperands()) { auto type = operand.getType(); - if(auto t = type.dyn_cast()) { + if (auto t = type.dyn_cast()) { auto rows = t.getNumRows(); auto cols = t.getNumCols(); - if(rows < 0 || cols < 0) { + if (rows < 0 || cols < 0) { logger->warn("Ignoring unknown dimension in max mem check of {}" - "dims are: {}x{}\nsetting unknowns to 1 for this test", op->getName().getStringRef().str(), - rows, cols); - if(rows < 0) + "dims are: {}x{}\nsetting unknowns to 1 for this test", + op->getName().getStringRef().str(), rows, cols); + if (rows < 0) rows = 1; - if(cols < 0) + if (cols < 0) cols = 1; } opSize += rows * cols * t.getElementType().getIntOrFloatBitWidth() / 8; @@ -101,33 +101,34 @@ struct MarkCUDAOpsPass : public PassWrappertrace("op in size: {} kb", opSize / 1024); - for(auto result : op->getResults()) { + for (auto result : op->getResults()) { auto type = result.getType(); - if(auto t = type.dyn_cast()) { + if (auto t = type.dyn_cast()) { opSize += t.getNumRows() * t.getNumCols() * t.getElementType().getIntOrFloatBitWidth() / 8; } } - logger->debug("op out size: {} kb\ntotal op size: {} mb", (opSize-inSize) / 1024, - opSize / 1048576); + logger->debug("op out size: {} kb\ntotal op size: {} mb", (opSize - inSize) / 1024, opSize / 1048576); - if(opSize < mem_budget) + if (opSize < mem_budget) return true; else return false; } - + // ToDo: requirements should be set per operator in tablegen - bool hasReqMinDims(mlir::Operation* op) const { - auto checkDims = [this,op](const mlir::Type& type) -> bool { - if(auto t = type.dyn_cast()) { + bool hasReqMinDims(mlir::Operation *op) const { + auto checkDims = [this, op](const mlir::Type &type) -> bool { + if (auto t = type.dyn_cast()) { auto rows = t.getNumRows(); auto cols = t.getNumCols(); - if(rows < 0 || cols < 0) { - logger->warn("Ignoring unknown dimension in min input size check of {} dims are: {}x{}\nsetting " - "unknowns to 256 for this test", op->getName().getStringRef().str(), rows, cols); - if(rows < 0) + if (rows < 0 || cols < 0) { 
+ logger->warn("Ignoring unknown dimension in min input size " + "check of {} dims are: {}x{}\nsetting " + "unknowns to 256 for this test", + op->getName().getStringRef().str(), rows, cols); + if (rows < 0) rows = 256; - if(cols < 0) + if (cols < 0) cols = 256; } return (rows > 255 || cols > 255); @@ -136,27 +137,27 @@ struct MarkCUDAOpsPass : public PassWrappergetOperandTypes()) { - if((ret = checkDims(type))) + for (auto type : op->getOperandTypes()) { + if ((ret = checkDims(type))) break; } - if(!ret) { - for (auto type: op->getResultTypes()) { - if((ret = checkDims(type))) + if (!ret) { + for (auto type : op->getResultTypes()) { + if ((ret = checkDims(type))) break; } } return ret; } - - bool checkUseCUDA(Operation* op) const { + + bool checkUseCUDA(Operation *op) const { logger->trace("checkUseCUDA: {}", op->getName().getStringRef().str()); bool use_cuda = op->hasTrait(); logger->trace("{} CUDA supported={}", op->getName().getStringRef().str(), use_cuda); use_cuda = use_cuda && CompilerUtils::isMatrixComputation(op); logger->trace("{} isMatrixComputation={}", op->getName().getStringRef().str(), use_cuda); - if(!cfg.force_cuda) { + if (!cfg.force_cuda) { use_cuda = use_cuda && hasReqMinDims(op); logger->trace("{} hasMinInputDims={}", op->getName().getStringRef().str(), use_cuda); use_cuda = use_cuda && fitsInMemory(op); @@ -167,21 +168,19 @@ struct MarkCUDAOpsPass : public PassWrapperwalk([&](Operation* op) { + getOperation()->walk([&](Operation *op) { logger->debug("MarkCUDAOpsPass: {} parent: {}", op->getName().getStringRef().str(), - op->getParentOp()->getName().getStringRef().str()); + op->getParentOp()->getName().getStringRef().str()); OpBuilder builder(op); // handle vectorizedPipelineOps - if (auto constOp = llvm::dyn_cast(op)) - { + if (auto constOp = llvm::dyn_cast(op)) { WalkResult::advance(); return; - } - else if (auto pipelineOp = llvm::dyn_cast(op)) + } else if (auto pipelineOp = llvm::dyn_cast(op)) addCUDAOpsToVectorizedPipeline(builder, pipelineOp); else { - if((!llvm::isa(op->getParentOp()) && checkUseCUDA(op)) || - llvm::isa(op)) { + if ((!llvm::isa(op->getParentOp()) && checkUseCUDA(op)) || + llvm::isa(op)) { op->setAttr("cuda_device", builder.getI32IntegerAttr(0)); } } @@ -189,7 +188,7 @@ void MarkCUDAOpsPass::runOnOperation() { }); } -std::unique_ptr daphne::createMarkCUDAOpsPass(const DaphneUserConfig& cfg) { +std::unique_ptr daphne::createMarkCUDAOpsPass(const DaphneUserConfig &cfg) { return std::make_unique(cfg); } diff --git a/src/compiler/lowering/MarkFPGAOPENCLOpsPass.cpp b/src/compiler/lowering/MarkFPGAOPENCLOpsPass.cpp index 7d281322a..ef5666d06 100644 --- a/src/compiler/lowering/MarkFPGAOPENCLOpsPass.cpp +++ b/src/compiler/lowering/MarkFPGAOPENCLOpsPass.cpp @@ -27,31 +27,31 @@ struct MarkFPGAOPENCLOpsPass : public PassWrappergetName().getStringRef().str() << std::endl; + bool checkUseFPGAOPENCL(Operation *op) const { + // std::cout << "checkUseFPGAOPENCL: " << + // op->getName().getStringRef().str() << std::endl; return op->hasTrait(); } }; void MarkFPGAOPENCLOpsPass::runOnOperation() { func::FuncOp f = getOperation(); - f->walk([&](Operation* op) { + f->walk([&](Operation *op) { OpBuilder builder(op); - if(checkUseFPGAOPENCL(op)) { + if (checkUseFPGAOPENCL(op)) { op->setAttr("fpgaopencl_device", builder.getI32IntegerAttr(0)); } WalkResult::advance(); }); } -std::unique_ptr daphne::createMarkFPGAOPENCLOpsPass(const DaphneUserConfig& cfg) { +std::unique_ptr daphne::createMarkFPGAOPENCLOpsPass(const DaphneUserConfig &cfg) { return std::make_unique(cfg); } 
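// --- Illustration (not part of the patch): the fitsInMemory() arithmetic
// above, worked through once. Every operand and result contributes
// rows * cols * bitwidth / 8 bytes, and the op is only marked for CUDA if
// the sum stays below 90% of total device memory. The 8 GiB device size is
// an assumption for the example.
#include <cstdint>
#include <cstdio>
#include <vector>

struct Shape {
    std::int64_t rows, cols;
    int bitwidth;
};

int main() {
    std::vector<Shape> operandsAndResults = {
        {10000, 10000, 64}, // lhs, f64
        {10000, 10000, 64}, // rhs, f64
        {10000, 10000, 64}, // result, f64
    };
    std::uint64_t opSize = 0;
    for (auto s : operandsAndResults) {
        std::int64_t rows = s.rows < 0 ? 1 : s.rows; // unknown dims default to 1
        std::int64_t cols = s.cols < 0 ? 1 : s.cols;
        opSize += std::uint64_t(rows) * cols * s.bitwidth / 8;
    }
    std::uint64_t total_gpu_mem = 8ull << 30;          // assume an 8 GiB device
    std::uint64_t mem_budget = total_gpu_mem * 9 / 10; // 0.9 * total_gpu_mem
    std::printf("op needs %llu MiB, budget %llu MiB -> %s\n", (unsigned long long)(opSize >> 20),
                (unsigned long long)(mem_budget >> 20), opSize < mem_budget ? "offload" : "keep on host");
}
// --- end illustration ---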
-#endif +#endif diff --git a/src/compiler/lowering/MatMulOpLowering.cpp b/src/compiler/lowering/MatMulOpLowering.cpp index 80b6709b7..c0aef46db 100644 --- a/src/compiler/lowering/MatMulOpLowering.cpp +++ b/src/compiler/lowering/MatMulOpLowering.cpp @@ -23,7 +23,6 @@ #include #include "compiler/utils/LoweringUtils.h" -#include #include "hwloc.h" #include "ir/daphneir/Daphne.h" #include "ir/daphneir/Passes.h" @@ -66,6 +65,7 @@ #include "spdlog/spdlog.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include namespace mlir { #define GEN_PASS_DECL_MATMULOPLOWERINGPASS @@ -79,537 +79,531 @@ static constexpr int ROW = 0; static constexpr int COL = 1; struct LowerMatMulOpOptions { - LowerMatMulOpOptions() {} - int vec_size_bits{0}; - int num_vec_registers{0}; - bool vectorize{false}; - bool tile{false}; - bool invert_loops{false}; - bool useFixedTileSizes{false}; - llvm::SmallVector cache_sizes; - llvm::SmallVector tile_sizes; - int unroll_factor{0}; - int unroll_jam_factor{0}; - - LowerMatMulOpOptions &setTileSizes(std::vector sizes) { - tile_sizes.clear(); - for (auto s : sizes) { - tile_sizes.push_back(s); + LowerMatMulOpOptions() {} + int vec_size_bits{0}; + int num_vec_registers{0}; + bool vectorize{false}; + bool tile{false}; + bool invert_loops{false}; + bool useFixedTileSizes{false}; + llvm::SmallVector cache_sizes; + llvm::SmallVector tile_sizes; + int unroll_factor{0}; + int unroll_jam_factor{0}; + + LowerMatMulOpOptions &setTileSizes(std::vector sizes) { + tile_sizes.clear(); + for (auto s : sizes) { + tile_sizes.push_back(s); + } + return *this; + } + LowerMatMulOpOptions &setUnrollFactor(int f) { + unroll_factor = f; + return *this; + } + LowerMatMulOpOptions &setUnrollJamFactor(int f) { + unroll_jam_factor = f; + return *this; + } + LowerMatMulOpOptions &setCacheSizes(llvm::SmallVector caches) { + cache_sizes.clear(); + for (auto c : caches) { + cache_sizes.push_back(c); + } + return *this; + } + LowerMatMulOpOptions &enableVectorization(bool b = true) { + vectorize = b; + return *this; + } + LowerMatMulOpOptions &setVectorSizeBits(int s) { + vec_size_bits = s; + return *this; + } + LowerMatMulOpOptions &setNumberOfVectorRegisters(int s) { + num_vec_registers = s; + return *this; } - return *this; - } - LowerMatMulOpOptions &setUnrollFactor(int f) { - unroll_factor = f; - return *this; - } - LowerMatMulOpOptions &setUnrollJamFactor(int f) { - unroll_jam_factor = f; - return *this; - } - LowerMatMulOpOptions &setCacheSizes(llvm::SmallVector caches) { - cache_sizes.clear(); - for (auto c : caches) { - cache_sizes.push_back(c); + LowerMatMulOpOptions &enableTiling(bool b = true) { + tile = b; + return *this; } - return *this; - } - LowerMatMulOpOptions &enableVectorization(bool b = true) { - vectorize = b; - return *this; - } - LowerMatMulOpOptions &setVectorSizeBits(int s) { - vec_size_bits = s; - return *this; - } - LowerMatMulOpOptions &setNumberOfVectorRegisters(int s) { - num_vec_registers = s; - return *this; - } - LowerMatMulOpOptions &enableTiling(bool b = true) { - tile = b; - return *this; - } - LowerMatMulOpOptions &enableLoopInversion(bool b = true) { - invert_loops = b; - return *this; - } - int getVecSize(int bitwidth) const { - if (vec_size_bits > 0) { - return std::max(1, vec_size_bits / bitwidth); - } else { - return 1; + LowerMatMulOpOptions &enableLoopInversion(bool b = true) { + invert_loops = b; + return *this; } - } - int getRegisterSize() const { - if (num_vec_registers != 0 && vec_size_bits != 0) { - return std::max(1, num_vec_registers * 
vec_size_bits); + int getVecSize(int bitwidth) const { + if (vec_size_bits > 0) { + return std::max(1, vec_size_bits / bitwidth); + } else { + return 1; + } + } + int getRegisterSize() const { + if (num_vec_registers != 0 && vec_size_bits != 0) { + return std::max(1, num_vec_registers * vec_size_bits); + } + return 1; } - return 1; - } }; bool is_valid_options(LowerMatMulOpOptions const options) { - for (auto s : options.tile_sizes) - if (s <= 1) { - spdlog::warn("Tile sizes must be an integer larger than 1."); - return false; + for (auto s : options.tile_sizes) + if (s <= 1) { + spdlog::warn("Tile sizes must be an integer larger than 1."); + return false; + } + if (options.unroll_factor < 0) { + spdlog::warn("Unroll factor must be an integer >= 0."); + return false; } - if (options.unroll_factor < 0) { - spdlog::warn("Unroll factor must be an integer >= 0."); - return false; - } - if (options.unroll_jam_factor < 0) { - spdlog::warn("Unroll jam factor must be an integer >= 0."); - return false; - } - if (options.vec_size_bits < 0) { - spdlog::warn("Vector size bits must be an integer >= 0."); - return false; - } - return true; -} - -class MatMulLowering : public OpConversionPattern { - const LowerMatMulOpOptions options; - -public: - using OpConversionPattern::OpConversionPattern; - explicit MatMulLowering(mlir::TypeConverter &typeConverter, - MLIRContext *context, - LowerMatMulOpOptions const &options) - : OpConversionPattern(typeConverter, context, - PatternBenefit(1)), - options(options) { - this->setDebugName("MatMulLowering"); - } - - bool is_vectorizable(ArrayRef const rhsShape, - Type const matrixElementType) const { - if (rhsShape[COL] % - options.getVecSize(matrixElementType.getIntOrFloatBitWidth()) != - 0) { - return false; + if (options.unroll_jam_factor < 0) { + spdlog::warn("Unroll jam factor must be an integer >= 0."); + return false; } - if (!matrixElementType.isa()) { - return false; + if (options.vec_size_bits < 0) { + spdlog::warn("Vector size bits must be an integer >= 0."); + return false; } return true; - } - - bool is_tileable(ArrayRef const rhsShape) const { return true; } - - llvm::SmallVector - affineMatMul(mlir::Value &lhs, mlir::Value &rhs, mlir::Value &output, - ConversionPatternRewriter &rewriter, mlir::Location loc, - ArrayRef lhsShape, ArrayRef rhsShape, - mlir::MLIRContext *ctx, SmallVector &loops, - Type elementType) const { - // row loop - auto rowLoop = rewriter.create(loc, 0, lhsShape[ROW], 1); - // row loop body - rewriter.setInsertionPointToStart(rowLoop.getBody()); - // col loop - auto colLoop = rewriter.create(loc, 0, rhsShape[COL], 1); - // col loop body - rewriter.setInsertionPointToStart(colLoop.getBody()); - // fma loop - auto fmaLoop = rewriter.create(loc, 0, rhsShape[ROW], 1); - // inner loop body - rewriter.setInsertionPointToStart(fmaLoop.getBody()); - - auto a = rewriter.create( - loc, lhs, - ValueRange{rowLoop.getInductionVar(), fmaLoop.getInductionVar()}); - auto b = rewriter.create( - loc, rhs, - ValueRange{fmaLoop.getInductionVar(), colLoop.getInductionVar()}); - auto c = rewriter.create( - loc, output, - ValueRange{rowLoop.getInductionVar(), colLoop.getInductionVar()}); - if (elementType.isIntOrIndex()) { - // Arith operates on MLIR signless integers, while Daphne uses (un)signed - // integers. 
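// --- Illustration (not part of the patch): the three affine loops built by
// affineMatMul() above compute the classic i-j-k matmul over a zero-filled
// output (rowLoop = i, colLoop = j, fmaLoop = k). The same computation in
// plain C++ over row-major buffers:
#include <cstdio>
#include <vector>

void matmul(const double *A, const double *B, double *C, int M, int N, int K) {
    for (int i = 0; i < M; ++i)         // row loop
        for (int j = 0; j < N; ++j)     // col loop
            for (int k = 0; k < K; ++k) // fma loop
                C[i * N + j] += A[i * K + k] * B[k * N + j];
}

int main() {
    std::vector<double> A = {1, 2, 3, 4}, B = {5, 6, 7, 8}, C(4, 0.0);
    matmul(A.data(), B.data(), C.data(), 2, 2, 2);
    std::printf("%g %g / %g %g\n", C[0], C[1], C[2], C[3]); // 19 22 / 43 50
}
// --- end illustration ---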
- Value castedA = this->typeConverter->materializeTargetConversion( - rewriter, loc, - rewriter.getIntegerType(elementType.getIntOrFloatBitWidth()), - ValueRange{a}); - Value castedB = this->typeConverter->materializeTargetConversion( - rewriter, loc, - rewriter.getIntegerType(elementType.getIntOrFloatBitWidth()), - ValueRange{b}); - Value castedC = this->typeConverter->materializeTargetConversion( - rewriter, loc, - rewriter.getIntegerType(elementType.getIntOrFloatBitWidth()), - ValueRange{c}); - Value added = rewriter.create(loc, castedA, castedB); - Value res = rewriter.create(loc, added, castedC); - Value castedRes = this->typeConverter->materializeSourceConversion( - rewriter, loc, elementType, ValueRange{res}); - rewriter.create( - loc, castedRes, output, - ValueRange{rowLoop.getInductionVar(), colLoop.getInductionVar()}); - } else { - Value res = rewriter.create(loc, a, b, c); - rewriter.create( - loc, res, output, - ValueRange{rowLoop.getInductionVar(), colLoop.getInductionVar()}); - } +} - // AffineYieldOp at end of loop blocks - rewriter.setInsertionPointAfter(fmaLoop); - rewriter.setInsertionPointAfter(colLoop); - rewriter.setInsertionPointAfter(rowLoop); - - loops.push_back(rowLoop); - loops.push_back(colLoop); - loops.push_back(fmaLoop); - return loops; - } - - llvm::SmallVector vectorizedAffineMatMul( - mlir::Value &lhs, mlir::Value &rhs, mlir::Value &output, - ConversionPatternRewriter &rewriter, mlir::Location loc, - ArrayRef lhsShape, ArrayRef rhsShape, - mlir::MLIRContext *ctx, llvm::SmallVector &loops, - Type elementType, int64_t vec_size) const { - auto vec_Type = mlir::VectorType::get({vec_size}, elementType); - - // row loop - auto rowLoop = rewriter.create(loc, 0, lhsShape[ROW], 1); - // row loop body - rewriter.setInsertionPointToStart(rowLoop.getBody()); - // col loop - auto colLoop = - rewriter.create(loc, 0, rhsShape[COL], vec_size); - // col loop body - rewriter.setInsertionPointToStart(colLoop.getBody()); - // fma loop - auto fmaLoop = rewriter.create(loc, 0, rhsShape[ROW], 1); - // inner loop body - rewriter.setInsertionPointToStart(fmaLoop.getBody()); - - auto a_single = rewriter.create( - loc, lhs, - ValueRange{rowLoop.getInductionVar(), fmaLoop.getInductionVar()}); - auto a = rewriter.create(loc, a_single, vec_Type); - auto b = rewriter.create( - loc, vec_Type, rhs, - ValueRange{fmaLoop.getInductionVar(), colLoop.getInductionVar()}); - auto c = rewriter.create( - loc, vec_Type, output, - ValueRange{rowLoop.getInductionVar(), colLoop.getInductionVar()}); - - // TODO: Integer doesn't actually work yet, so is disabled in - // is_vectorizable. 
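// --- Illustration (not part of the patch): what vectorizedAffineMatMul()
// above does per k-iteration: broadcast one scalar of A across a vector,
// load a contiguous chunk of a B row and of the C row, and fuse
// multiply-add them. Plain C++ stand-in with VEC fixed to 4 lanes; like the
// pass (see is_vectorizable), it assumes N % VEC == 0.
constexpr int VEC = 4; // lanes; in the pass: vec_size_bits / element bitwidth

void matmulVectorized(const double *A, const double *B, double *C, int M, int N, int K) {
    for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; j += VEC) { // col loop steps by VEC
            for (int k = 0; k < K; ++k) {
                double a = A[i * K + k];      // scalar load + broadcast
                for (int l = 0; l < VEC; ++l) // one "vector" FMA
                    C[i * N + j + l] += a * B[k * N + j + l];
            }
        }
}
// --- end illustration ---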
- if (elementType.isIntOrIndex()) { - Value added = rewriter.create(loc, a, b); - Value res = rewriter.create(loc, added, c); - rewriter.create( - loc, res, output, - ValueRange{rowLoop.getInductionVar(), colLoop.getInductionVar()}); - } else { - Value res = rewriter.create(loc, a, b, c); - rewriter.create( - loc, res, output, - ValueRange{rowLoop.getInductionVar(), colLoop.getInductionVar()}); +class MatMulLowering : public OpConversionPattern { + const LowerMatMulOpOptions options; + + public: + using OpConversionPattern::OpConversionPattern; + explicit MatMulLowering(mlir::TypeConverter &typeConverter, MLIRContext *context, + LowerMatMulOpOptions const &options) + : OpConversionPattern(typeConverter, context, PatternBenefit(1)), options(options) { + this->setDebugName("MatMulLowering"); } - // AffineYieldOp at end of loop blocks - rewriter.setInsertionPointAfter(fmaLoop); - rewriter.setInsertionPointAfter(colLoop); - rewriter.setInsertionPointAfter(rowLoop); - - loops.push_back(rowLoop); - loops.push_back(colLoop); - loops.push_back(fmaLoop); - return loops; - } - - LogicalResult - matchAndRewrite(daphne::MatMulOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op->getLoc(); - mlir::daphne::MatrixType lhsMatrixType = - adaptor.getLhs().getType().dyn_cast(); - mlir::daphne::MatrixType rhsMatrixType = - adaptor.getRhs().getType().dyn_cast(); - - auto lhsRows = lhsMatrixType.getNumRows(); - auto lhsCols = lhsMatrixType.getNumCols(); - - auto rhsRows = rhsMatrixType.getNumRows(); - auto rhsCols = rhsMatrixType.getNumCols(); - - auto matrixElementType = lhsMatrixType.getElementType(); - - // TODO(phil): if shape is unknown, e.g., row/col = -1 we currently - // can't create a MemRefType - auto lhsMemRefType = - mlir::MemRefType::get({lhsRows, lhsCols}, matrixElementType); - auto rhsMemRefType = - mlir::MemRefType::get({rhsRows, rhsCols}, matrixElementType); - - mlir::MemRefType outputMemRefType = - mlir::MemRefType::get({lhsRows, rhsCols}, matrixElementType); - - // daphne::Matrix -> memref - mlir::Value lhs = rewriter.create( - op->getLoc(), lhsMemRefType, adaptor.getLhs()); - mlir::Value rhs = rewriter.create( - op->getLoc(), rhsMemRefType, adaptor.getRhs()); - - // Alloc output memref - mlir::Value outputMemRef = - insertMemRefAlloc(outputMemRefType, loc, rewriter); - - // Fill the output MemRef - if (matrixElementType.isIntOrIndex()) { - auto signless_type = - rewriter.getIntegerType(matrixElementType.getIntOrFloatBitWidth()); - auto fillValue = rewriter.create( - loc, signless_type, rewriter.getIntegerAttr(signless_type, 0)); - auto castedFillValue = this->typeConverter->materializeTargetConversion( - rewriter, loc, matrixElementType, mlir::ValueRange{fillValue}); - affineFillMemRefInt(castedFillValue, rewriter, loc, - outputMemRefType.getShape(), op->getContext(), - outputMemRef); - } else { - affineFillMemRef(0.0, rewriter, loc, outputMemRefType.getShape(), - op->getContext(), outputMemRef, matrixElementType); + bool is_vectorizable(ArrayRef const rhsShape, Type const matrixElementType) const { + if (rhsShape[COL] % options.getVecSize(matrixElementType.getIntOrFloatBitWidth()) != 0) { + return false; + } + if (!matrixElementType.isa()) { + return false; + } + return true; } - // Do the actual MatMul with hand built codegen - SmallVector loops; - if (options.vectorize && - is_vectorizable(rhsMemRefType.getShape(), matrixElementType)) { - vectorizedAffineMatMul( - lhs, rhs, outputMemRef, rewriter, loc, lhsMemRefType.getShape(), - 
rhsMemRefType.getShape(), op->getContext(), loops, matrixElementType, - options.getVecSize(matrixElementType.getIntOrFloatBitWidth())); - } else { - affineMatMul(lhs, rhs, outputMemRef, rewriter, loc, - lhsMemRefType.getShape(), rhsMemRefType.getShape(), - op->getContext(), loops, matrixElementType); - } - if (options.tile && is_tileable(rhsMemRefType.getShape())) { - auto tile_sizes = extendTileSizes(lhsRows); - if (!options.useFixedTileSizes) { - tile_sizes = getTileSizesFromCache(matrixElementType, - loops[1].getStep(), lhsRows); - } - tile_loops(loc, loops, tile_sizes); - } else if (options.invert_loops){ - permuteLoops(loops, {0, 2, 1}); - } - mlir::Value DM = - convertMemRefToDenseMatrix(loc, rewriter, outputMemRef, op.getType()); - - rewriter.replaceOp(op, DM); - return success(); - } - - // tile_loops requires 5 tile sizes. If fewer tile sizes are specified, we can - // extend with the size of the loop, since loops with only one iteration are - // later removed. - SmallVector extendTileSizes(int64_t max_loop_length) const { - SmallVector tile_sizes = options.tile_sizes; - while (tile_sizes.size() < 5) { - tile_sizes.push_back(max_loop_length); - } - return tile_sizes; - } - - // Choose tile sizes so that reuse is happening across the cache levels. This - // is just a proof of concept and not a very sophisticated strategy. Assuming - // cache sizes are in Bytes not KB or other units. Assume square matmul of - // length loop_length. The target below is laid out assuming there are a - // number of vector registers available. If not all cache sizes "move down" a - // slot if set. If there are also no cache sizes available, set MR and NR to - // 2, since otherwise the tiling breaks. Target: MR * NR ~ Register size * 3 - // / 4 - // KC * NR ~ L1, - // MC * KC ~ L2, - // NC * MC ~ L3 - // & NR divides NC & MR divides MC - SmallVector getTileSizesFromCache(Type const matrixElementType, - int64_t vec_size, - int64_t loop_length) const { - SmallVector tile_sizes; - int bitwidth = matrixElementType.getIntOrFloatBitWidth(); - int register_size = options.getRegisterSize(); - int no_register = 0; - if (register_size == 1) { - if (options.cache_sizes.size() > 0) { - tile_sizes.push_back( - std::max(2, (int)(std::sqrt(register_size / bitwidth)))); - tile_sizes.push_back(tile_sizes.back()); - no_register++; - } else { - tile_sizes.push_back(2); - tile_sizes.push_back(2); - } - } else { - tile_sizes.push_back( - std::max(2, (int)(std::sqrt(register_size / bitwidth * 3 / 4)))); - tile_sizes.push_back(tile_sizes.back()); + + bool is_tileable(ArrayRef const rhsShape) const { return true; } + + llvm::SmallVector affineMatMul(mlir::Value &lhs, mlir::Value &rhs, mlir::Value &output, + ConversionPatternRewriter &rewriter, mlir::Location loc, + ArrayRef lhsShape, ArrayRef rhsShape, + mlir::MLIRContext *ctx, SmallVector &loops, + Type elementType) const { + // row loop + auto rowLoop = rewriter.create(loc, 0, lhsShape[ROW], 1); + // row loop body + rewriter.setInsertionPointToStart(rowLoop.getBody()); + // col loop + auto colLoop = rewriter.create(loc, 0, rhsShape[COL], 1); + // col loop body + rewriter.setInsertionPointToStart(colLoop.getBody()); + // fma loop + auto fmaLoop = rewriter.create(loc, 0, rhsShape[ROW], 1); + // inner loop body + rewriter.setInsertionPointToStart(fmaLoop.getBody()); + + auto a = + rewriter.create(loc, lhs, ValueRange{rowLoop.getInductionVar(), fmaLoop.getInductionVar()}); + auto b = + rewriter.create(loc, rhs, ValueRange{fmaLoop.getInductionVar(), colLoop.getInductionVar()}); + 
auto c = rewriter.create(loc, output, + ValueRange{rowLoop.getInductionVar(), colLoop.getInductionVar()}); + if (elementType.isIntOrIndex()) { + // Arith operates on MLIR signless integers, while Daphne uses + // (un)signed integers. + Value castedA = this->typeConverter->materializeTargetConversion( + rewriter, loc, rewriter.getIntegerType(elementType.getIntOrFloatBitWidth()), ValueRange{a}); + Value castedB = this->typeConverter->materializeTargetConversion( + rewriter, loc, rewriter.getIntegerType(elementType.getIntOrFloatBitWidth()), ValueRange{b}); + Value castedC = this->typeConverter->materializeTargetConversion( + rewriter, loc, rewriter.getIntegerType(elementType.getIntOrFloatBitWidth()), ValueRange{c}); + Value added = rewriter.create(loc, castedA, castedB); + Value res = rewriter.create(loc, added, castedC); + Value castedRes = + this->typeConverter->materializeSourceConversion(rewriter, loc, elementType, ValueRange{res}); + rewriter.create(loc, castedRes, output, + ValueRange{rowLoop.getInductionVar(), colLoop.getInductionVar()}); + } else { + Value res = rewriter.create(loc, a, b, c); + rewriter.create(loc, res, output, + ValueRange{rowLoop.getInductionVar(), colLoop.getInductionVar()}); + } + + // AffineYieldOp at end of loop blocks + rewriter.setInsertionPointAfter(fmaLoop); + rewriter.setInsertionPointAfter(colLoop); + rewriter.setInsertionPointAfter(rowLoop); + + loops.push_back(rowLoop); + loops.push_back(colLoop); + loops.push_back(fmaLoop); + return loops; } - if (options.cache_sizes.size() > 0) { - int idx = 0; - for (auto cache_size = options.cache_sizes.begin() + no_register; - cache_size != options.cache_sizes.end(); cache_size++) { - unsigned candidate = - std::max(1, (int)(*cache_size / tile_sizes.back() / bitwidth)); - if (idx == 3) - candidate = candidate - (candidate % tile_sizes[0]); - if (idx == 4) - candidate = candidate - (candidate % tile_sizes[1]); - tile_sizes.push_back(candidate); - idx++; - } + + llvm::SmallVector vectorizedAffineMatMul(mlir::Value &lhs, mlir::Value &rhs, mlir::Value &output, + ConversionPatternRewriter &rewriter, mlir::Location loc, + ArrayRef lhsShape, ArrayRef rhsShape, + mlir::MLIRContext *ctx, + llvm::SmallVector &loops, Type elementType, + int64_t vec_size) const { + auto vec_Type = mlir::VectorType::get({vec_size}, elementType); + + // row loop + auto rowLoop = rewriter.create(loc, 0, lhsShape[ROW], 1); + // row loop body + rewriter.setInsertionPointToStart(rowLoop.getBody()); + // col loop + auto colLoop = rewriter.create(loc, 0, rhsShape[COL], vec_size); + // col loop body + rewriter.setInsertionPointToStart(colLoop.getBody()); + // fma loop + auto fmaLoop = rewriter.create(loc, 0, rhsShape[ROW], 1); + // inner loop body + rewriter.setInsertionPointToStart(fmaLoop.getBody()); + + auto a_single = + rewriter.create(loc, lhs, ValueRange{rowLoop.getInductionVar(), fmaLoop.getInductionVar()}); + auto a = rewriter.create(loc, a_single, vec_Type); + auto b = rewriter.create(loc, vec_Type, rhs, + ValueRange{fmaLoop.getInductionVar(), colLoop.getInductionVar()}); + auto c = rewriter.create(loc, vec_Type, output, + ValueRange{rowLoop.getInductionVar(), colLoop.getInductionVar()}); + + // TODO: Integer doesn't actually work yet, so is disabled in + // is_vectorizable. 
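The vectorized variant above differs from the scalar nest only in the column loop, which advances by vec_size lanes at a time: one element of the left operand is broadcast across the lanes, multiplied with a contiguous slice of row k of the right operand, and accumulated into the matching slice of the output row. A scalar C++ model of that inner kernel (names and the row-major layout are illustrative assumptions; the pass itself emits broadcast, vector-load, FMA, and vector-store ops rather than a lane loop):

#include <cstdint>

// One execution of the innermost body, vec_size lanes at a time.
void vectorizedBody(const double *A, const double *B, double *C, int64_t N,
                    int64_t K, int64_t i, int64_t j, int64_t k,
                    int64_t vec_size) {
    const double a = A[i * K + k]; // broadcast operand
    for (int64_t lane = 0; lane < vec_size; ++lane)
        C[i * N + j + lane] += a * B[k * N + j + lane];
}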
+ if (elementType.isIntOrIndex()) { + Value added = rewriter.create(loc, a, b); + Value res = rewriter.create(loc, added, c); + rewriter.create(loc, res, output, + ValueRange{rowLoop.getInductionVar(), colLoop.getInductionVar()}); + } else { + Value res = rewriter.create(loc, a, b, c); + rewriter.create(loc, res, output, + ValueRange{rowLoop.getInductionVar(), colLoop.getInductionVar()}); + } + + // AffineYieldOp at end of loop blocks + rewriter.setInsertionPointAfter(fmaLoop); + rewriter.setInsertionPointAfter(colLoop); + rewriter.setInsertionPointAfter(rowLoop); + + loops.push_back(rowLoop); + loops.push_back(colLoop); + loops.push_back(fmaLoop); + return loops; } - while (tile_sizes.size() < 5) { - tile_sizes.push_back(loop_length); + + LogicalResult matchAndRewrite(daphne::MatMulOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op->getLoc(); + mlir::daphne::MatrixType lhsMatrixType = adaptor.getLhs().getType().dyn_cast(); + mlir::daphne::MatrixType rhsMatrixType = adaptor.getRhs().getType().dyn_cast(); + + auto lhsRows = lhsMatrixType.getNumRows(); + auto lhsCols = lhsMatrixType.getNumCols(); + + auto rhsRows = rhsMatrixType.getNumRows(); + auto rhsCols = rhsMatrixType.getNumCols(); + + auto matrixElementType = lhsMatrixType.getElementType(); + + // TODO(phil): if shape is unknown, e.g., row/col = -1 we currently + // can't create a MemRefType + auto lhsMemRefType = mlir::MemRefType::get({lhsRows, lhsCols}, matrixElementType); + auto rhsMemRefType = mlir::MemRefType::get({rhsRows, rhsCols}, matrixElementType); + + mlir::MemRefType outputMemRefType = mlir::MemRefType::get({lhsRows, rhsCols}, matrixElementType); + + // daphne::Matrix -> memref + mlir::Value lhs = + rewriter.create(op->getLoc(), lhsMemRefType, adaptor.getLhs()); + mlir::Value rhs = + rewriter.create(op->getLoc(), rhsMemRefType, adaptor.getRhs()); + + // Alloc output memref + mlir::Value outputMemRef = insertMemRefAlloc(outputMemRefType, loc, rewriter); + + // Fill the output MemRef + if (matrixElementType.isIntOrIndex()) { + auto signless_type = rewriter.getIntegerType(matrixElementType.getIntOrFloatBitWidth()); + auto fillValue = + rewriter.create(loc, signless_type, rewriter.getIntegerAttr(signless_type, 0)); + auto castedFillValue = this->typeConverter->materializeTargetConversion(rewriter, loc, matrixElementType, + mlir::ValueRange{fillValue}); + affineFillMemRef(castedFillValue, rewriter, loc, outputMemRefType.getShape(), op->getContext(), + outputMemRef); + } else { + mlir::Value fillValue = rewriter.create( + loc, matrixElementType, rewriter.getFloatAttr(matrixElementType, 0.0)); + affineFillMemRef(fillValue, rewriter, loc, outputMemRefType.getShape(), op->getContext(), outputMemRef); + } + // Do the actual MatMul with hand built codegen + SmallVector loops; + if (options.vectorize && is_vectorizable(rhsMemRefType.getShape(), matrixElementType)) { + vectorizedAffineMatMul(lhs, rhs, outputMemRef, rewriter, loc, lhsMemRefType.getShape(), + rhsMemRefType.getShape(), op->getContext(), loops, matrixElementType, + options.getVecSize(matrixElementType.getIntOrFloatBitWidth())); + } else { + affineMatMul(lhs, rhs, outputMemRef, rewriter, loc, lhsMemRefType.getShape(), rhsMemRefType.getShape(), + op->getContext(), loops, matrixElementType); + } + if (options.tile && is_tileable(rhsMemRefType.getShape())) { + auto tile_sizes = extendTileSizes(lhsRows); + if (!options.useFixedTileSizes) { + tile_sizes = getTileSizesFromCache(matrixElementType, loops[1].getStep(), lhsRows); + } + 
tile_loops(loc, loops, tile_sizes); + } else if (options.invert_loops) { + permuteLoops(loops, {0, 2, 1}); + } + mlir::Value DM = convertMemRefToDenseMatrix(loc, rewriter, outputMemRef, op.getType()); + + rewriter.replaceOp(op, DM); + return success(); } - // If vector size is longer than 1, we need to keep that in mind for the NR - // loop - if (vec_size > 1) - tile_sizes[1] = std::max(1, (int)(tile_sizes[1] / vec_size)); - return tile_sizes; - } - - // Tile the affine loop nest generated from MatMulOp with the specified tile - // sizes. Includes validations to follow the movement and creation of the tile - // loops. - void tile_loops(mlir::Location loc, - SmallVector loops, - SmallVector tile_sizes) const { - unsigned NC = tile_sizes[4]; - unsigned MC = tile_sizes[3]; - unsigned KC = tile_sizes[2]; - unsigned NR = tile_sizes[1]; - unsigned MR = tile_sizes[0]; - unsigned KU = options.unroll_factor; - [[maybe_unused]] auto vec_size = loops[1].getStep(); - llvm::SmallVector loopNest; - getPerfectlyNestedLoops(loopNest, loops.front()); - // tile i with MC, j with NC, k with KC - llvm::SmallVector tiledNest; - if (failed(tilePerfectlyNested(loopNest, {MC, NC, KC}, &tiledNest))) { - spdlog::warn("Could not tile the loop nest in MatMulLowering"); - }; - - #define GEN_ERR_MSG(name, size, expected) \ - std::string(name) + " should have step size " + std::string(expected) + " but is " + std::to_string(size) - - if (tiledNest[0].getStep() != MC) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("tiledNest 0", tiledNest[0].getStep(), "MC (" + std::to_string(MC) + ")")); - if (tiledNest[1].getStep() != NC * vec_size) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("tiledNest 1", tiledNest[1].getStep(), "NC * vec_size (" + std::to_string(NC * vec_size) + ")")); - if (tiledNest[2].getStep() != KC) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("tiledNest 2", tiledNest[2].getStep(), "KC (" + std::to_string(KC) + ")")); - if (tiledNest[3].getStep() != 1) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("tiledNest 3", tiledNest[3].getStep(), "1")); - if (tiledNest[4].getStep() != vec_size) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("tiledNest 4", tiledNest[4].getStep(), "vec_size (" + std::to_string(vec_size) + ")")); - if (tiledNest[5].getStep() != 1) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("tiledNest 5", tiledNest[5].getStep(), "1")); - - // Further tile the i mod MC loop with MR - if (failed(tilePerfectlyNested(tiledNest[3], {MR}))) { - spdlog::warn("Could not tile the second i loop in MatMulLowering"); - }; - - // Further tile the j mod NC loop with NR - if (tiledNest[4].getStep() != vec_size) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("tiledNest 4", tiledNest[4].getStep(), "vec_size (" + std::to_string(vec_size) + ")")); - if (failed(tilePerfectlyNested(tiledNest[4], {NR}))) { - spdlog::warn("Could not tile the second j loop in MatMulLowering"); - }; - - llvm::SmallVector twiceTiledNest; - getPerfectlyNestedLoops(twiceTiledNest, tiledNest[0]); - // i loops - if (twiceTiledNest[0].getStep() != MC) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("twiceTiledNest 0", twiceTiledNest[0].getStep(), "MC (" + std::to_string(MC) + ")")); - if (twiceTiledNest[3].getStep() != MR) - throw 
ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("twiceTiledNest 3", twiceTiledNest[3].getStep(), "MR (" + std::to_string(MR) + ")")); - if (twiceTiledNest[4].getStep() != 1) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("twiceTiledNest 4", twiceTiledNest[4].getStep(), "1")); - - // j loops - if (twiceTiledNest[1].getStep() != NC * vec_size) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("twiceTiledNest 1", twiceTiledNest[1].getStep(), "NC * vec_size (" + std::to_string(NC * vec_size) + ")")); - if (twiceTiledNest[5].getStep() != NR * vec_size) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("twiceTiledNest 5", twiceTiledNest[5].getStep(), "NR * vec_size (" + std::to_string(NR * vec_size) + ")")); - if (twiceTiledNest[6].getStep() != vec_size) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("twiceTiledNest 6", twiceTiledNest[6].getStep(), "vec_size (" + std::to_string(vec_size) + ")")); - - // k loops - if (twiceTiledNest[2].getStep() != KC) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("twiceTiledNest 2", twiceTiledNest[2].getStep(), "KC (" + std::to_string(KC) + ")")); - if (twiceTiledNest[7].getStep() != 1) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("twiceTiledNest 7", twiceTiledNest[7].getStep(), "1")); - - // permute loops to final order (i / MC, j / NC, k / KC, i / MR, i mod MR, j - // / NR, j mod NR, k mod KC) -> - // (j / NC, k / KC, i / MC, j / NR, i / MR, k - // mod KC, j mod NR, i mod MR) - unsigned root_idx = permuteLoops(twiceTiledNest, {2, 0, 1, 4, 7, 3, 6, 5}); - - // Unroll and jam - llvm::SmallVector blisTiledLoops; - getPerfectlyNestedLoops(blisTiledLoops, twiceTiledNest[root_idx]); - // i loops - if (blisTiledLoops[2].getStep() != MC) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("blisTiled 2", blisTiledLoops[2].getStep(), "MC (" + std::to_string(MC) + ")")); - if (blisTiledLoops[4].getStep() != MR) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("blisTiled 4", blisTiledLoops[4].getStep(), "MR (" + std::to_string(MR) + ")")); - if (blisTiledLoops[7].getStep() != 1) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("blisTiled 7", blisTiledLoops[7].getStep(), "1")); - - // j loops - if (blisTiledLoops[0].getStep() != NC * vec_size) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("blisTiled 0", blisTiledLoops[0].getStep(), "NC * vec_size (" + std::to_string(NC * vec_size) + ")")); - if (blisTiledLoops[3].getStep() != NR * vec_size) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("blisTiled 3", blisTiledLoops[3].getStep(), "NR * vec_size (" + std::to_string(NR * vec_size) + ")")); - if (blisTiledLoops[6].getStep() != vec_size) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("blisTiled 6", blisTiledLoops[6].getStep(), "vec_size (" + std::to_string(vec_size) + ")")); - - // k loops - if (blisTiledLoops[1].getStep() != KC) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", GEN_ERR_MSG("blisTiled 1", blisTiledLoops[1].getStep(), "KC (" + std::to_string(KC) + ")")); - if (blisTiledLoops[5].getStep() != 1) - throw ErrorHandler::compilerError(loc, "MatMulOpLowering 
(tile_loops)", GEN_ERR_MSG("blisTiled 5", blisTiledLoops[5].getStep(), "1")); - - #undef GEN_ERR_MSG - - // Unroll jam causes Segfault, if called in a way where the loop is not - // cleanly divided. - if (options.unroll_jam_factor > 0 && - blisTiledLoops[5].getUpperBound().getMap().getNumResults() == 1 && - succeeded(loopUnrollJamUpToFactor(blisTiledLoops[5], - options.unroll_jam_factor))) { - if (blisTiledLoops[6].getUpperBound().getMap().getNumResults() != 1 || - failed(loopUnrollJamUpToFactor(blisTiledLoops[6], - options.unroll_jam_factor))) { - spdlog::warn( - "Could not unroll the (j mod NC) mod NR loop in MatMulLowering"); - } - } else { - spdlog::warn( - "Could not unroll the (i mod MC) mod MR loop in MatMulLowering"); + + // tile_loops requires 5 tile sizes. If fewer tile sizes are specified, we + // can extend the list with the loop length, since loops with only one + // iteration are removed later. + SmallVector extendTileSizes(int64_t max_loop_length) const { + SmallVector tile_sizes = options.tile_sizes; + while (tile_sizes.size() < 5) { + tile_sizes.push_back(max_loop_length); + } + return tile_sizes; } - llvm::SmallVector lastNest; - getPerfectlyNestedLoops(lastNest, blisTiledLoops.front()); - int64_t i = 0; - while (succeeded(promoteIfSingleIteration(lastNest[i])) && i < 4) { - i++; + // Choose tile sizes so that reuse happens across the cache levels. + // This is just a proof of concept and not a very sophisticated strategy. + // Cache sizes are assumed to be in bytes, not KB or other units. Assume a + // square matmul of side length loop_length. The targets below assume that + // a number of vector registers is available; if not, all cache sizes move + // down a slot (if set). If no cache sizes are available either, set MR + // and NR to 2, since the tiling breaks otherwise.
Target: MR * NR ~ + // Register size * 3 / 4 + // KC * NR ~ L1, + // MC * KC ~ L2, + // NC * MC ~ L3 + // & NR divides NC & MR divides MC + SmallVector getTileSizesFromCache(Type const matrixElementType, int64_t vec_size, + int64_t loop_length) const { + SmallVector tile_sizes; + int bitwidth = matrixElementType.getIntOrFloatBitWidth(); + int register_size = options.getRegisterSize(); + int no_register = 0; + if (register_size == 1) { + if (options.cache_sizes.size() > 0) { + tile_sizes.push_back(std::max(2, (int)(std::sqrt(register_size / bitwidth)))); + tile_sizes.push_back(tile_sizes.back()); + no_register++; + } else { + tile_sizes.push_back(2); + tile_sizes.push_back(2); + } + } else { + tile_sizes.push_back(std::max(2, (int)(std::sqrt(register_size / bitwidth * 3 / 4)))); + tile_sizes.push_back(tile_sizes.back()); + } + if (options.cache_sizes.size() > 0) { + int idx = 0; + for (auto cache_size = options.cache_sizes.begin() + no_register; cache_size != options.cache_sizes.end(); + cache_size++) { + unsigned candidate = std::max(1, (int)(*cache_size / tile_sizes.back() / bitwidth)); + if (idx == 3) + candidate = candidate - (candidate % tile_sizes[0]); + if (idx == 4) + candidate = candidate - (candidate % tile_sizes[1]); + tile_sizes.push_back(candidate); + idx++; + } + } + while (tile_sizes.size() < 5) { + tile_sizes.push_back(loop_length); + } + // If vector size is longer than 1, we need to keep that in mind for the + // NR loop + if (vec_size > 1) + tile_sizes[1] = std::max(1, (int)(tile_sizes[1] / vec_size)); + return tile_sizes; } - if (KU > 0 && failed(loopUnrollUpToFactor(lastNest.back(), KU))) { - spdlog::warn("Could not unroll the K loop in MatMulLowering"); + // Tile the affine loop nest generated from MatMulOp with the specified tile + // sizes. Includes validations to follow the movement and creation of the + // tile loops. 
+ void tile_loops(mlir::Location loc, SmallVector loops, SmallVector tile_sizes) const { + unsigned NC = tile_sizes[4]; + unsigned MC = tile_sizes[3]; + unsigned KC = tile_sizes[2]; + unsigned NR = tile_sizes[1]; + unsigned MR = tile_sizes[0]; + unsigned KU = options.unroll_factor; + [[maybe_unused]] auto vec_size = loops[1].getStep(); + llvm::SmallVector loopNest; + getPerfectlyNestedLoops(loopNest, loops.front()); + // tile i with MC, j with NC, k with KC + llvm::SmallVector tiledNest; + if (failed(tilePerfectlyNested(loopNest, {MC, NC, KC}, &tiledNest))) { + spdlog::warn("Could not tile the loop nest in MatMulLowering"); + }; + +#define GEN_ERR_MSG(name, size, expected) \ + std::string(name) + " should have step size " + std::string(expected) + " but is " + std::to_string(size) + + if (tiledNest[0].getStep() != MC) + throw ErrorHandler::compilerError( + loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("tiledNest 0", tiledNest[0].getStep(), "MC (" + std::to_string(MC) + ")")); + if (tiledNest[1].getStep() != NC * vec_size) + throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("tiledNest 1", tiledNest[1].getStep(), + "NC * vec_size (" + std::to_string(NC * vec_size) + ")")); + if (tiledNest[2].getStep() != KC) + throw ErrorHandler::compilerError( + loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("tiledNest 2", tiledNest[2].getStep(), "KC (" + std::to_string(KC) + ")")); + if (tiledNest[3].getStep() != 1) + throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("tiledNest 3", tiledNest[3].getStep(), "1")); + if (tiledNest[4].getStep() != vec_size) + throw ErrorHandler::compilerError( + loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("tiledNest 4", tiledNest[4].getStep(), "vec_size (" + std::to_string(vec_size) + ")")); + if (tiledNest[5].getStep() != 1) + throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("tiledNest 5", tiledNest[5].getStep(), "1")); + + // Further tile the i mod MC loop with MR + if (failed(tilePerfectlyNested(tiledNest[3], {MR}))) { + spdlog::warn("Could not tile the second i loop in MatMulLowering"); + }; + + // Further tile the j mod NC loop with NR + if (tiledNest[4].getStep() != vec_size) + throw ErrorHandler::compilerError( + loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("tiledNest 4", tiledNest[4].getStep(), "vec_size (" + std::to_string(vec_size) + ")")); + if (failed(tilePerfectlyNested(tiledNest[4], {NR}))) { + spdlog::warn("Could not tile the second j loop in MatMulLowering"); + }; + + llvm::SmallVector twiceTiledNest; + getPerfectlyNestedLoops(twiceTiledNest, tiledNest[0]); + // i loops + if (twiceTiledNest[0].getStep() != MC) + throw ErrorHandler::compilerError( + loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("twiceTiledNest 0", twiceTiledNest[0].getStep(), "MC (" + std::to_string(MC) + ")")); + if (twiceTiledNest[3].getStep() != MR) + throw ErrorHandler::compilerError( + loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("twiceTiledNest 3", twiceTiledNest[3].getStep(), "MR (" + std::to_string(MR) + ")")); + if (twiceTiledNest[4].getStep() != 1) + throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("twiceTiledNest 4", twiceTiledNest[4].getStep(), "1")); + + // j loops + if (twiceTiledNest[1].getStep() != NC * vec_size) + throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("twiceTiledNest 1", twiceTiledNest[1].getStep(), + "NC * vec_size (" + std::to_string(NC * vec_size) 
+ ")")); + if (twiceTiledNest[5].getStep() != NR * vec_size) + throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("twiceTiledNest 5", twiceTiledNest[5].getStep(), + "NR * vec_size (" + std::to_string(NR * vec_size) + ")")); + if (twiceTiledNest[6].getStep() != vec_size) + throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("twiceTiledNest 6", twiceTiledNest[6].getStep(), + "vec_size (" + std::to_string(vec_size) + ")")); + + // k loops + if (twiceTiledNest[2].getStep() != KC) + throw ErrorHandler::compilerError( + loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("twiceTiledNest 2", twiceTiledNest[2].getStep(), "KC (" + std::to_string(KC) + ")")); + if (twiceTiledNest[7].getStep() != 1) + throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("twiceTiledNest 7", twiceTiledNest[7].getStep(), "1")); + + // permute loops to final order (i / MC, j / NC, k / KC, i / MR, i mod + // MR, j / NR, j mod NR, k mod KC) -> + // (j / NC, k / KC, i / MC, j / NR, i / MR, + // k mod KC, j mod NR, i mod MR) + unsigned root_idx = permuteLoops(twiceTiledNest, {2, 0, 1, 4, 7, 3, 6, 5}); + + // Unroll and jam + llvm::SmallVector blisTiledLoops; + getPerfectlyNestedLoops(blisTiledLoops, twiceTiledNest[root_idx]); + // i loops + if (blisTiledLoops[2].getStep() != MC) + throw ErrorHandler::compilerError( + loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("blisTiled 2", blisTiledLoops[2].getStep(), "MC (" + std::to_string(MC) + ")")); + if (blisTiledLoops[4].getStep() != MR) + throw ErrorHandler::compilerError( + loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("blisTiled 4", blisTiledLoops[4].getStep(), "MR (" + std::to_string(MR) + ")")); + if (blisTiledLoops[7].getStep() != 1) + throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("blisTiled 7", blisTiledLoops[7].getStep(), "1")); + + // j loops + if (blisTiledLoops[0].getStep() != NC * vec_size) + throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("blisTiled 0", blisTiledLoops[0].getStep(), + "NC * vec_size (" + std::to_string(NC * vec_size) + ")")); + if (blisTiledLoops[3].getStep() != NR * vec_size) + throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("blisTiled 3", blisTiledLoops[3].getStep(), + "NR * vec_size (" + std::to_string(NR * vec_size) + ")")); + if (blisTiledLoops[6].getStep() != vec_size) + throw ErrorHandler::compilerError( + loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("blisTiled 6", blisTiledLoops[6].getStep(), "vec_size (" + std::to_string(vec_size) + ")")); + + // k loops + if (blisTiledLoops[1].getStep() != KC) + throw ErrorHandler::compilerError( + loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("blisTiled 1", blisTiledLoops[1].getStep(), "KC (" + std::to_string(KC) + ")")); + if (blisTiledLoops[5].getStep() != 1) + throw ErrorHandler::compilerError(loc, "MatMulOpLowering (tile_loops)", + GEN_ERR_MSG("blisTiled 5", blisTiledLoops[5].getStep(), "1")); + +#undef GEN_ERR_MSG + + // Unroll jam causes Segfault, if called in a way where the loop is not + // cleanly divided. 
+ if (options.unroll_jam_factor > 0 && blisTiledLoops[5].getUpperBound().getMap().getNumResults() == 1 && + succeeded(loopUnrollJamUpToFactor(blisTiledLoops[5], options.unroll_jam_factor))) { + if (blisTiledLoops[6].getUpperBound().getMap().getNumResults() != 1 || + failed(loopUnrollJamUpToFactor(blisTiledLoops[6], options.unroll_jam_factor))) { + spdlog::warn("Could not unroll the (j mod NC) mod NR loop in " + "MatMulLowering"); + } + } else { + spdlog::warn("Could not unroll the (i mod MC) mod MR loop in " + "MatMulLowering"); + } + + llvm::SmallVector lastNest; + getPerfectlyNestedLoops(lastNest, blisTiledLoops.front()); + int64_t i = 0; + while (succeeded(promoteIfSingleIteration(lastNest[i])) && i < 4) { + i++; + } + + if (KU > 0 && failed(loopUnrollUpToFactor(lastNest.back(), KU))) { + spdlog::warn("Could not unroll the K loop in MatMulLowering"); + } } - } }; namespace { @@ -626,129 +620,118 @@ namespace { * * A more detailed description can be found in 'daphneir/Passes.td'. */ -struct MatMulLoweringPass - : public impl::MatMulOpLoweringPassBase { - MatMulLoweringPass() = default; - -public: - explicit MatMulLoweringPass(bool matmul_tile, int matmul_vec_size_bits, - std::vector matmul_fixed_tile_sizes, - bool matmul_use_fixed_tile_sizes, - int matmul_unroll_factor, - int matmul_unroll_jam_factor, - int matmul_num_vec_registers, - bool matmul_invert_loops) - : impl::MatMulOpLoweringPassBase() { - this->matmul_tile = matmul_tile; - this->matmul_vec_size_bits = matmul_vec_size_bits; - this->matmul_fixed_tile_sizes = matmul_fixed_tile_sizes; - this->matmul_use_fixed_tile_sizes = matmul_use_fixed_tile_sizes; - this->matmul_unroll_factor = matmul_unroll_factor; - this->matmul_unroll_jam_factor = matmul_unroll_jam_factor; - this->matmul_num_vec_registers = matmul_num_vec_registers; - this->matmul_invert_loops = matmul_invert_loops; - } - - void runOnOperation() override; - -private: - // Get the L1, L2 and L3 cache sizes to adapt tile sizes. - // So far assumes process is executed on a single processing unit. 
- // See example: - // https://www.open-mpi.org/projects/hwloc/doc/v2.2.0/a00324.php#cli_examples - SmallVector get_cache_sizes() const { - hwloc_topology_t topology; - hwloc_obj_t obj; - SmallVector sizes; - - // Allocate and initialize topology object - hwloc_topology_init(&topology); - // Perform topology detection - hwloc_topology_load(topology); - - for (obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0); obj; - obj = obj->parent) - if (hwloc_obj_type_is_cache(obj->type)) { - sizes.push_back(obj->attr->cache.size); - } - return sizes; - } +struct MatMulLoweringPass : public impl::MatMulOpLoweringPassBase { + MatMulLoweringPass() = default; + + public: + explicit MatMulLoweringPass(bool matmul_tile, int matmul_vec_size_bits, + std::vector matmul_fixed_tile_sizes, bool matmul_use_fixed_tile_sizes, + int matmul_unroll_factor, int matmul_unroll_jam_factor, int matmul_num_vec_registers, + bool matmul_invert_loops) + : impl::MatMulOpLoweringPassBase() { + this->matmul_tile = matmul_tile; + this->matmul_vec_size_bits = matmul_vec_size_bits; + this->matmul_fixed_tile_sizes = matmul_fixed_tile_sizes; + this->matmul_use_fixed_tile_sizes = matmul_use_fixed_tile_sizes; + this->matmul_unroll_factor = matmul_unroll_factor; + this->matmul_unroll_jam_factor = matmul_unroll_jam_factor; + this->matmul_num_vec_registers = matmul_num_vec_registers; + this->matmul_invert_loops = matmul_invert_loops; + } + + void runOnOperation() override; + + private: + // Get the L1, L2 and L3 cache sizes to adapt tile sizes. + // So far assumes process is executed on a single processing unit. + // See example: + // https://www.open-mpi.org/projects/hwloc/doc/v2.2.0/a00324.php#cli_examples + SmallVector get_cache_sizes() const { + hwloc_topology_t topology; + hwloc_obj_t obj; + SmallVector sizes; + + // Allocate and initialize topology object + hwloc_topology_init(&topology); + // Perform topology detection + hwloc_topology_load(topology); + + for (obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0); obj; obj = obj->parent) + if (hwloc_obj_type_is_cache(obj->type)) { + sizes.push_back(obj->attr->cache.size); + } + return sizes; + } }; } // end anonymous namespace void MatMulLoweringPass::runOnOperation() { - auto module = getOperation(); - mlir::ConversionTarget target(getContext()); - mlir::RewritePatternSet patterns(&getContext()); - LowerToLLVMOptions llvmOptions(&getContext()); - LLVMTypeConverter typeConverter(&getContext(), llvmOptions); - - typeConverter.addConversion(convertInteger); - typeConverter.addConversion(convertFloat); - typeConverter.addConversion([](Type type) { return type; }); - typeConverter.addArgumentMaterialization(materializeCastFromIllegal); - typeConverter.addSourceMaterialization(materializeCastToIllegal); - typeConverter.addTargetMaterialization(materializeCastFromIllegal); - - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - LowerMatMulOpOptions options; - if (matmul_tile) { - options.enableTiling(); - if (matmul_use_fixed_tile_sizes) { - options.useFixedTileSizes = true; - options.setTileSizes(matmul_fixed_tile_sizes); - } else { - options.setCacheSizes(get_cache_sizes()); + auto module = getOperation(); + mlir::ConversionTarget target(getContext()); + mlir::RewritePatternSet 
patterns(&getContext()); + LowerToLLVMOptions llvmOptions(&getContext()); + LLVMTypeConverter typeConverter(&getContext(), llvmOptions); + + typeConverter.addConversion(convertInteger); + typeConverter.addConversion(convertFloat); + typeConverter.addConversion([](Type type) { return type; }); + typeConverter.addArgumentMaterialization(materializeCastFromIllegal); + typeConverter.addSourceMaterialization(materializeCastToIllegal); + typeConverter.addTargetMaterialization(materializeCastFromIllegal); + + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + target.addLegalDialect(); + + target.addLegalOp(); + target.addLegalOp(); + target.addLegalOp(); + LowerMatMulOpOptions options; + if (matmul_tile) { + options.enableTiling(); + if (matmul_use_fixed_tile_sizes) { + options.useFixedTileSizes = true; + options.setTileSizes(matmul_fixed_tile_sizes); + } else { + options.setCacheSizes(get_cache_sizes()); + } + options.setUnrollFactor(matmul_unroll_factor); + options.setUnrollJamFactor(matmul_unroll_jam_factor); + } + if (matmul_vec_size_bits > 0) { + options.enableVectorization(); + options.setVectorSizeBits(matmul_vec_size_bits); + } + options.enableLoopInversion(matmul_invert_loops); + options.setNumberOfVectorRegisters(matmul_num_vec_registers); + target.addDynamicallyLegalOp( + [options](Operation *op) { return !is_valid_options(options); }); + + patterns.insert(typeConverter, &getContext(), options); + + if (failed(applyPartialConversion(module, target, std::move(patterns)))) { + signalPassFailure(); } - options.setUnrollFactor(matmul_unroll_factor); - options.setUnrollJamFactor(matmul_unroll_jam_factor); - } - if (matmul_vec_size_bits > 0) { - options.enableVectorization(); - options.setVectorSizeBits(matmul_vec_size_bits); - } - options.enableLoopInversion(matmul_invert_loops); - options.setNumberOfVectorRegisters(matmul_num_vec_registers); - target.addDynamicallyLegalOp( - [options](Operation *op) { return !is_valid_options(options); }); - - patterns.insert(typeConverter, &getContext(), options); - - if (failed(applyPartialConversion(module, target, std::move(patterns)))) { - signalPassFailure(); - } } -std::unique_ptr> -mlir::daphne::createMatMulOpLoweringPass( - bool matmul_tile, int matmul_vec_size_bits, - std::vector matmul_fixed_tile_sizes, - bool matmul_use_fixed_tile_sizes, int matmul_unroll_factor, - int matmul_unroll_jam_factor, int matmul_num_vec_registers, - bool matmul_invert_loops) { - return std::make_unique( - matmul_tile, matmul_vec_size_bits, matmul_fixed_tile_sizes, - matmul_use_fixed_tile_sizes, matmul_unroll_factor, - matmul_unroll_jam_factor, matmul_num_vec_registers, - matmul_invert_loops); +std::unique_ptr> mlir::daphne::createMatMulOpLoweringPass( + bool matmul_tile, int matmul_vec_size_bits, std::vector matmul_fixed_tile_sizes, + bool matmul_use_fixed_tile_sizes, int matmul_unroll_factor, int matmul_unroll_jam_factor, + int matmul_num_vec_registers, bool matmul_invert_loops) { + return std::make_unique( + matmul_tile, matmul_vec_size_bits, matmul_fixed_tile_sizes, matmul_use_fixed_tile_sizes, matmul_unroll_factor, + matmul_unroll_jam_factor, matmul_num_vec_registers, matmul_invert_loops); } // This is used by daphne-opt and automatically inserts the options provided on // the command line into the pass. 
-std::unique_ptr> -mlir::daphne::createMatMulOpLoweringPass() { - return std::make_unique(); +std::unique_ptr> mlir::daphne::createMatMulOpLoweringPass() { + return std::make_unique(); } diff --git a/src/compiler/lowering/ModOpLowering.cpp b/src/compiler/lowering/ModOpLowering.cpp index fb1fd8f11..8d047726e 100644 --- a/src/compiler/lowering/ModOpLowering.cpp +++ b/src/compiler/lowering/ModOpLowering.cpp @@ -32,131 +32,97 @@ using namespace mlir; -class EwModOpLowering - : public mlir::OpConversionPattern { - public: +class EwModOpLowering : public mlir::OpConversionPattern { + public: using OpConversionPattern::OpConversionPattern; [[nodiscard]] bool optimization_viable(mlir::Value divisor) const { - std::pair isConstant = - CompilerUtils::isConstant(divisor); + std::pair isConstant = CompilerUtils::isConstant(divisor); return isConstant.first && (isConstant.second & (isConstant.second - 1)) == 0; } - void optimizeEwModOp(mlir::Value memRef, mlir::Value divisor, - ArrayRef shape, - ConversionPatternRewriter &rewriter, - Location loc) const { + void optimizeEwModOp(mlir::Value memRef, mlir::Value divisor, ArrayRef shape, + ConversionPatternRewriter &rewriter, Location loc) const { // divisor - 1 - mlir::Value cst_one = rewriter.create( - loc, rewriter.getI64Type(), rewriter.getI64IntegerAttr(1)); + mlir::Value cst_one = + rewriter.create(loc, rewriter.getI64Type(), rewriter.getI64IntegerAttr(1)); - auto casted_divisor = typeConverter->materializeTargetConversion( - rewriter, loc, rewriter.getI64Type(), ValueRange{divisor}); + auto casted_divisor = + typeConverter->materializeTargetConversion(rewriter, loc, rewriter.getI64Type(), ValueRange{divisor}); - mlir::Value rhs = - rewriter.create(loc, casted_divisor, cst_one); + mlir::Value rhs = rewriter.create(loc, casted_divisor, cst_one); SmallVector lowerBounds(/*Rank=*/2, /*Value=*/0); SmallVector steps(/*Rank=*/2, /*Value=*/1); buildAffineLoopNest( - rewriter, loc, lowerBounds, shape, steps, - [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { - mlir::Value load = - nestedBuilder.create(loc, memRef, ivs); + rewriter, loc, lowerBounds, shape, steps, [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { + mlir::Value load = nestedBuilder.create(loc, memRef, ivs); mlir::Value res{}; - Value castedLhs = - this->typeConverter->materializeTargetConversion( - nestedBuilder, loc, - nestedBuilder.getIntegerType( - divisor.getType().getIntOrFloatBitWidth()), - ValueRange{load}); + Value castedLhs = this->typeConverter->materializeTargetConversion( + nestedBuilder, loc, nestedBuilder.getIntegerType(divisor.getType().getIntOrFloatBitWidth()), + ValueRange{load}); res = nestedBuilder.create(loc, castedLhs, rhs); - Value castedRes = - this->typeConverter->materializeSourceConversion( - nestedBuilder, loc, divisor.getType(), ValueRange{res}); + Value castedRes = this->typeConverter->materializeSourceConversion(nestedBuilder, loc, + divisor.getType(), ValueRange{res}); - nestedBuilder.create(loc, castedRes, memRef, - ivs); + nestedBuilder.create(loc, castedRes, memRef, ivs); }); } - void lowerEwModOp(mlir::Value memRef, mlir::Value divisor, - ArrayRef shape, + void lowerEwModOp(mlir::Value memRef, mlir::Value divisor, ArrayRef shape, ConversionPatternRewriter &rewriter, Location loc) const { SmallVector lowerBounds(/*Rank=*/2, /*Value=*/0); SmallVector steps(/*Rank=*/2, /*Value=*/1); buildAffineLoopNest( - rewriter, loc, lowerBounds, shape, steps, - [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { - mlir::Value load = - 
nestedBuilder.create(loc, memRef, ivs); + rewriter, loc, lowerBounds, shape, steps, [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { + mlir::Value load = nestedBuilder.create(loc, memRef, ivs); mlir::Value res{}; // this is enough since divisor will be cast to float if // matrix is float if (llvm::isa(divisor.getType())) { - res = - nestedBuilder.create(loc, load, divisor); + res = nestedBuilder.create(loc, load, divisor); nestedBuilder.create(loc, res, memRef, ivs); return; } - Value castedLhs = - this->typeConverter->materializeTargetConversion( - nestedBuilder, loc, - nestedBuilder.getIntegerType( - divisor.getType().getIntOrFloatBitWidth()), - ValueRange{load}); - - Value castedRhs = - this->typeConverter->materializeTargetConversion( - nestedBuilder, loc, - nestedBuilder.getIntegerType( - divisor.getType().getIntOrFloatBitWidth()), - ValueRange{divisor}); - - res = nestedBuilder.create(loc, castedLhs, - castedRhs); - Value castedRes = - this->typeConverter->materializeSourceConversion( - nestedBuilder, loc, divisor.getType(), ValueRange{res}); - - nestedBuilder.create(loc, castedRes, memRef, - ivs); + Value castedLhs = this->typeConverter->materializeTargetConversion( + nestedBuilder, loc, nestedBuilder.getIntegerType(divisor.getType().getIntOrFloatBitWidth()), + ValueRange{load}); + + Value castedRhs = this->typeConverter->materializeTargetConversion( + nestedBuilder, loc, nestedBuilder.getIntegerType(divisor.getType().getIntOrFloatBitWidth()), + ValueRange{divisor}); + + res = nestedBuilder.create(loc, castedLhs, castedRhs); + Value castedRes = this->typeConverter->materializeSourceConversion(nestedBuilder, loc, + divisor.getType(), ValueRange{res}); + + nestedBuilder.create(loc, castedRes, memRef, ivs); }); } - mlir::LogicalResult matchAndRewrite( - mlir::daphne::EwModOp op, OpAdaptor adaptor, - mlir::ConversionPatternRewriter &rewriter) const override { - mlir::daphne::MatrixType lhsTensor = - adaptor.getLhs().getType().dyn_cast(); + mlir::LogicalResult matchAndRewrite(mlir::daphne::EwModOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + mlir::daphne::MatrixType lhsTensor = adaptor.getLhs().getType().dyn_cast(); auto lhsRows = lhsTensor.getNumRows(); auto lhsCols = lhsTensor.getNumCols(); - auto lhsMemRefType = mlir::MemRefType::get({lhsRows, lhsCols}, - lhsTensor.getElementType()); + auto lhsMemRefType = mlir::MemRefType::get({lhsRows, lhsCols}, lhsTensor.getElementType()); // daphne::Matrix -> memref mlir::Value lhs = - rewriter.create( - op->getLoc(), lhsMemRefType, adaptor.getLhs()); + rewriter.create(op->getLoc(), lhsMemRefType, adaptor.getLhs()); mlir::Value rhs = adaptor.getRhs(); if (optimization_viable(rhs)) - optimizeEwModOp(lhs, rhs, - {lhsTensor.getNumRows(), lhsTensor.getNumCols()}, - rewriter, op->getLoc()); + optimizeEwModOp(lhs, rhs, {lhsTensor.getNumRows(), lhsTensor.getNumCols()}, rewriter, op->getLoc()); else - lowerEwModOp(lhs, rhs, - {lhsTensor.getNumRows(), lhsTensor.getNumCols()}, - rewriter, op->getLoc()); + lowerEwModOp(lhs, rhs, {lhsTensor.getNumRows(), lhsTensor.getNumCols()}, rewriter, op->getLoc()); - mlir::Value output = convertMemRefToDenseMatrix(op->getLoc(), rewriter, - lhs, op.getType()); + mlir::Value output = convertMemRefToDenseMatrix(op->getLoc(), rewriter, lhs, op.getType()); rewriter.replaceOp(op, output); return success(); } @@ -171,15 +137,12 @@ namespace { * If possible, we additionally perform the integer modulo optimization by * replacing the modulo with a bitwise AND and a
subtraction. */ -struct ModOpLoweringPass - : public mlir::PassWrapper> { +struct ModOpLoweringPass : public mlir::PassWrapper> { explicit ModOpLoweringPass() {} void getDependentDialects(mlir::DialectRegistry ®istry) const override { - registry - .insert(); + registry.insert(); } void runOnOperation() final; @@ -190,7 +153,7 @@ struct ModOpLoweringPass "and performing the mod op on values loaded from a MemRef."; } }; -} // end anonymous namespace +} // end anonymous namespace void ModOpLoweringPass::runOnOperation() { mlir::ConversionTarget target(getContext()); @@ -221,6 +184,4 @@ void ModOpLoweringPass::runOnOperation() { } } -std::unique_ptr mlir::daphne::createModOpLoweringPass() { - return std::make_unique(); -} +std::unique_ptr mlir::daphne::createModOpLoweringPass() { return std::make_unique(); } diff --git a/src/compiler/lowering/PhyOperatorSelectionPass.cpp b/src/compiler/lowering/PhyOperatorSelectionPass.cpp index 0ac0de062..798784df9 100644 --- a/src/compiler/lowering/PhyOperatorSelectionPass.cpp +++ b/src/compiler/lowering/PhyOperatorSelectionPass.cpp @@ -14,9 +14,9 @@ * limitations under the License. */ -#include #include "ir/daphneir/Daphne.h" #include "ir/daphneir/Passes.h" +#include #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" @@ -25,10 +25,10 @@ #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Func/Transforms/FuncConversions.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Transforms/DialectConversion.h" #include @@ -38,25 +38,23 @@ using namespace mlir; class MatMulOpLowering : public OpConversionPattern { -public: + public: using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(daphne::MatMulOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { + LogicalResult matchAndRewrite(daphne::MatMulOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { Value lhs = op.getLhs(); Value rhs = op.getRhs(); - if(auto to = lhs.getDefiningOp()) { + if (auto to = lhs.getDefiningOp()) { bool rhsTransposed = CompilerUtils::constantOrThrow( - op.getTransb(), "MatMulOp.getTransb() is expected to be a constant" - ); - if(to.getArg() == rhs && !rhsTransposed) { + op.getTransb(), "MatMulOp.getTransb() is expected to be a constant"); + if (to.getArg() == rhs && !rhsTransposed) { // `t(M) @ M` -> `syrk(M)` rewriter.replaceOpWithNewOp(op, op.getResult().getType(), rhs); return success(); } auto rhsMatTy = rhs.getType().dyn_cast(); - if((!rhsTransposed && rhsMatTy.getNumCols() == 1) || (rhsTransposed && rhsMatTy.getNumRows() == 1)) { + if ((!rhsTransposed && rhsMatTy.getNumCols() == 1) || (rhsTransposed && rhsMatTy.getNumRows() == 1)) { // `t(M) @ v` -> `gemv(M, v)` rewriter.replaceOpWithNewOp(op, op.getResult().getType(), to.getArg(), rhs); return success(); @@ -67,11 +65,10 @@ class MatMulOpLowering : public OpConversionPattern { }; namespace { - struct PhyOperatorSelectionPass - : public PassWrapper> { - explicit PhyOperatorSelectionPass() { } - void runOnOperation() final; - }; +struct PhyOperatorSelectionPass : public PassWrapper> { + explicit PhyOperatorSelectionPass() {} + void runOnOperation() final; +}; } // end anonymous namespace void PhyOperatorSelectionPass::runOnOperation() { @@ 
-93,26 +90,22 @@ void PhyOperatorSelectionPass::runOnOperation() { // (see MatMulOp::canonicalize()), once we do it there again, we need // to account for it here (and above in MatMulOpLowering). auto to = op.getLhs().getDefiningOp(); - bool rhsTransposed = CompilerUtils::constantOrThrow( - op.getTransb(), "MatMulOp.getTransb() is expected to be a constant" - ); + bool rhsTransposed = + CompilerUtils::constantOrThrow(op.getTransb(), "MatMulOp.getTransb() is expected to be a constant"); auto rhsMatTy = op.getRhs().getType().dyn_cast(); - return !(to && ( - // `t(M) @ M` -> `syrk(M)` - (to.getArg() == op.getRhs() && !rhsTransposed) || - // `t(M) @ v` -> `gemv(M, v)` - (!rhsTransposed && rhsMatTy.getNumCols() == 1) || - (rhsTransposed && rhsMatTy.getNumRows() == 1) - )); + return !(to && + ( + // `t(M) @ M` -> `syrk(M)` + (to.getArg() == op.getRhs() && !rhsTransposed) || + // `t(M) @ v` -> `gemv(M, v)` + (!rhsTransposed && rhsMatTy.getNumCols() == 1) || (rhsTransposed && rhsMatTy.getNumRows() == 1))); }); RewritePatternSet patterns(&getContext()); patterns.insert(&getContext()); - if(failed(applyPartialConversion(module, target, std::move(patterns)))) + if (failed(applyPartialConversion(module, target, std::move(patterns)))) signalPassFailure(); } -std::unique_ptr daphne::createPhyOperatorSelectionPass() { - return std::make_unique(); -} \ No newline at end of file +std::unique_ptr daphne::createPhyOperatorSelectionPass() { return std::make_unique(); } \ No newline at end of file diff --git a/src/compiler/lowering/ProfilingPass.cpp b/src/compiler/lowering/ProfilingPass.cpp index beda1edc0..d91a28c27 100644 --- a/src/compiler/lowering/ProfilingPass.cpp +++ b/src/compiler/lowering/ProfilingPass.cpp @@ -26,16 +26,14 @@ using namespace mlir; /** * @brief Inserts profiling tracepoints */ -struct ProfilingPass: public PassWrapper> -{ +struct ProfilingPass : public PassWrapper> { explicit ProfilingPass() {} void runOnOperation() final; }; -void ProfilingPass::runOnOperation() -{ +void ProfilingPass::runOnOperation() { func::FuncOp f = getOperation(); - Block & b = f.getBody().front(); + Block &b = f.getBody().front(); OpBuilder builder(&b, b.begin()); Location loc = builder.getUnknownLoc(); @@ -45,7 +43,4 @@ void ProfilingPass::runOnOperation() builder.create(loc); } -std::unique_ptr daphne::createProfilingPass() -{ - return std::make_unique(); -} +std::unique_ptr daphne::createProfilingPass() { return std::make_unique(); } diff --git a/src/compiler/lowering/RewriteSqlOpPass.cpp b/src/compiler/lowering/RewriteSqlOpPass.cpp index 832d3a82c..02509516e 100644 --- a/src/compiler/lowering/RewriteSqlOpPass.cpp +++ b/src/compiler/lowering/RewriteSqlOpPass.cpp @@ -13,9 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include #include "ir/daphneir/Daphne.h" #include "ir/daphneir/Passes.h" +#include #include #include "mlir/Dialect/Arith/IR/Arith.h" @@ -23,77 +23,67 @@ #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Transforms/DialectConversion.h" -#include -#include #include +#include #include #include #include #include +#include using namespace mlir; -namespace -{ - - std::unordered_map tables; - struct SqlReplacement : public RewritePattern{ - - SqlReplacement(MLIRContext * context, PatternBenefit benefit = 1) - : RewritePattern(Pattern::MatchAnyOpTypeTag(), benefit, context) - {} - - LogicalResult matchAndRewrite( - Operation *op, - PatternRewriter &rewriter - ) const override - { - if(auto rOp = llvm::dyn_cast(op)){ - std::stringstream view_stream; - view_stream << rOp.getView().str(); - mlir::Value arg = rOp.getArg(); - - tables[view_stream.str()] = arg; - rewriter.eraseOp(op); - return success(); - }else if(auto sqlop = llvm::dyn_cast(op)){ - std::stringstream sql_query; - sql_query << sqlop.getSql().str(); - - SQLParser parser; - parser.setView(tables); - parser.setSqlOp(sqlop); - std::string sourceName; - llvm::raw_string_ostream ss(sourceName); - ss << sqlop->getLoc(); - mlir::Value result_op; - try { - result_op = parser.parseStreamFrame(rewriter, sql_query, sourceName); - } catch (std::runtime_error &re) { - throw ErrorHandler::rethrowError("RewriteSqlOpPass", - re.what()); - } - rewriter.replaceOp(op, result_op); - // TODO Why is this necessary when we have already replaced the op? - rewriter.replaceAllUsesWith(op->getResult(0), result_op); - return success(); +namespace { + +std::unordered_map tables; +struct SqlReplacement : public RewritePattern { + + SqlReplacement(MLIRContext *context, PatternBenefit benefit = 1) + : RewritePattern(Pattern::MatchAnyOpTypeTag(), benefit, context) {} + + LogicalResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) const override { + if (auto rOp = llvm::dyn_cast(op)) { + std::stringstream view_stream; + view_stream << rOp.getView().str(); + mlir::Value arg = rOp.getArg(); + + tables[view_stream.str()] = arg; + rewriter.eraseOp(op); + return success(); + } else if (auto sqlop = llvm::dyn_cast(op)) { + std::stringstream sql_query; + sql_query << sqlop.getSql().str(); + + SQLParser parser; + parser.setView(tables); + parser.setSqlOp(sqlop); + std::string sourceName; + llvm::raw_string_ostream ss(sourceName); + ss << sqlop->getLoc(); + mlir::Value result_op; + try { + result_op = parser.parseStreamFrame(rewriter, sql_query, sourceName); + } catch (std::runtime_error &re) { + throw ErrorHandler::rethrowError("RewriteSqlOpPass", re.what()); } - return failure(); + rewriter.replaceOp(op, result_op); + // TODO Why is this necessary when we have already replaced the op? 
+ rewriter.replaceAllUsesWith(op->getResult(0), result_op); + return success(); } - }; + return failure(); + } +}; - struct RewriteSqlOpPass - : public PassWrapper > - { - void runOnOperation() final; +struct RewriteSqlOpPass : public PassWrapper> { + void runOnOperation() final; StringRef getArgument() const final { return "rewrite-sqlop"; } StringRef getDescription() const final { return "TODO"; } - }; -} +}; +} // namespace -void RewriteSqlOpPass::runOnOperation() -{ +void RewriteSqlOpPass::runOnOperation() { auto module = getOperation(); RewritePatternSet patterns(&getContext()); @@ -108,7 +98,4 @@ void RewriteSqlOpPass::runOnOperation() signalPassFailure(); } -std::unique_ptr daphne::createRewriteSqlOpPass() -{ - return std::make_unique(); -} +std::unique_ptr daphne::createRewriteSqlOpPass() { return std::make_unique(); } diff --git a/src/compiler/lowering/RewriteToCallKernelOpPass.cpp b/src/compiler/lowering/RewriteToCallKernelOpPass.cpp index 8f41d8f06..f99c9e472 100644 --- a/src/compiler/lowering/RewriteToCallKernelOpPass.cpp +++ b/src/compiler/lowering/RewriteToCallKernelOpPass.cpp @@ -15,11 +15,11 @@ */ #include "compiler/utils/CompilerUtils.h" -#include -#include -#include #include "ir/daphneir/Daphne.h" #include "ir/daphneir/Passes.h" +#include +#include +#include #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" @@ -29,683 +29,574 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/IRMapping.h" #include "mlir/IR/Location.h" #include "mlir/Transforms/DialectConversion.h" -#include "mlir/IR/IRMapping.h" -#include -#include #include +#include #include #include #include #include #include +#include #include using namespace mlir; -namespace -{ - class KernelReplacement : public RewritePattern - { - // TODO This method is only required since MLIR does not seem to - // provide a means to get this information. - static size_t getNumODSOperands(Operation * op) { - if(llvm::isa(op)) - return 4; - if(llvm::isa(op)) - return 4; - if(llvm::isa(op)) - return 3; - if(llvm::isa(op)) - return 2; - if(llvm::isa(op)) - return 1; - - throw ErrorHandler::compilerError( - op, "RewriteToCallKernelOpPass", - "lowering to kernel call not yet supported for this variadic " - "operation: " + - op->getName().getStringRef().str()); +namespace { +class KernelReplacement : public RewritePattern { + // TODO This method is only required since MLIR does not seem to + // provide a means to get this information. + static size_t getNumODSOperands(Operation *op) { + if (llvm::isa(op)) + return 4; + if (llvm::isa(op)) + return 4; + if (llvm::isa(op)) + return 3; + if (llvm::isa(op)) + return 2; + if (llvm::isa(op)) + return 1; + + throw ErrorHandler::compilerError(op, "RewriteToCallKernelOpPass", + "lowering to kernel call not yet supported for this variadic " + "operation: " + + op->getName().getStringRef().str()); + } + + // TODO This method is only required since MLIR does not seem to + // provide a means to get this information. But, for instance, the + // isVariadic boolean array is automatically generated *within* the + // getODSOperandIndexAndLength method. + static std::tuple getODSOperandInfo(Operation *op, unsigned index) { + // TODO Simplify those by a macro. 
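One way the "Simplify those by a macro" TODO below could be addressed is a case macro that stamps out the repeated dyn_cast blocks; this is only a sketch, the macro name is made up here, and it assumes the per-ODS-operand variadic flags stay hard-coded:

#define ODS_OPERAND_INFO_CASE(OP, ...)                                         \
    if (auto concreteOp = llvm::dyn_cast<OP>(op)) {                            \
        auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index);        \
        static bool isVariadic[] = {__VA_ARGS__};                              \
        return std::make_tuple(idxAndLen.first, idxAndLen.second,              \
                               isVariadic[index]);                             \
    }
// Hypothetical usage, for an op whose ODS operands are
// (non-variadic, variadic, variadic):
// ODS_OPERAND_INFO_CASE(SomeOp, false, true, true)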
+ if (auto concreteOp = llvm::dyn_cast(op)) { + auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); + static bool isVariadic[] = {true, true}; + return std::make_tuple(idxAndLen.first, idxAndLen.second, isVariadic[index]); + } + if (auto concreteOp = llvm::dyn_cast(op)) { + auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); + static bool isVariadic[] = {true}; + return std::make_tuple(idxAndLen.first, idxAndLen.second, isVariadic[index]); + } + if (auto concreteOp = llvm::dyn_cast(op)) { + auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); + static bool isVariadic[] = {false, true}; + return std::make_tuple(idxAndLen.first, idxAndLen.second, isVariadic[index]); } + if (auto concreteOp = llvm::dyn_cast(op)) { + auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); + static bool isVariadic[] = {true}; + return std::make_tuple(idxAndLen.first, idxAndLen.second, isVariadic[index]); + } + if (auto concreteOp = llvm::dyn_cast(op)) { + auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); + static bool isVariadic[] = {false, true, true}; + return std::make_tuple(idxAndLen.first, idxAndLen.second, isVariadic[index]); + } + if (auto concreteOp = llvm::dyn_cast(op)) { + auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); + static bool isVariadic[] = {false, false, true, true}; + return std::make_tuple(idxAndLen.first, idxAndLen.second, isVariadic[index]); + } + if (auto concreteOp = llvm::dyn_cast(op)) { + auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); + static bool isVariadic[] = {false, true, true, false}; + return std::make_tuple(idxAndLen.first, idxAndLen.second, isVariadic[index]); + } + throw ErrorHandler::compilerError(op, "RewriteToCallKernelOpPass", + "lowering to kernel call not yet supported for this variadic " + "operation: " + + op->getName().getStringRef().str()); + } + + /** + * @brief The value of type `DaphneContext` to insert as the last + * argument to all kernel calls. + */ + Value dctx; + + const DaphneUserConfig &userConfig; + std::unordered_map &usedLibPaths; + + mlir::Type adaptType(mlir::Type t, bool generalizeToStructure) const { + MLIRContext *mctx = t.getContext(); + if (generalizeToStructure && t.isa()) + return mlir::daphne::StructureType::get(mctx); + if (auto mt = t.dyn_cast()) + return mt.withSameElementTypeAndRepr(); + if (t.isa()) + return mlir::daphne::FrameType::get(mctx, {mlir::daphne::UnknownType::get(mctx)}); + if (auto lt = t.dyn_cast()) + return mlir::daphne::ListType::get(mctx, adaptType(lt.getElementType(), generalizeToStructure)); + if (auto mrt = t.dyn_cast()) + // Remove any dimension information ({0, 0}), but retain the element + // type. + return mlir::MemRefType::get({0, 0}, mrt.getElementType()); + return t; + } + + public: + /** + * Creates a new KernelReplacement rewrite pattern. + * + * @param mctx The MLIR context. + * @param dctx The DaphneContext to pass to the kernels. + * @param userConfig The user config. + * @param benefit + */ + KernelReplacement(MLIRContext *mctx, Value dctx, const DaphneUserConfig &userConfig, + std::unordered_map &usedLibPaths, PatternBenefit benefit = 1) + : RewritePattern(Pattern::MatchAnyOpTypeTag(), benefit, mctx), dctx(dctx), userConfig(userConfig), + usedLibPaths(usedLibPaths) {} + + /** + * @brief Rewrites the given operation to a `CallKernelOp`. 
+ *
+ * This involves looking up a matching kernel from the kernel catalog based
+ * on the mnemonic, argument/result types, and backend (e.g., hardware
+ * accelerator) of the given operation. Variadic operands are also taken
+ * into account.
+ *
+ * @param op The operation to rewrite.
+ * @param rewriter The rewriter.
+ * @return Always returns `mlir::success()` unless an exception is thrown.
+ */
+ LogicalResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) const override {
+ Location loc = op->getLoc();
+
+ // The argument/result types of the given operation.
+ Operation::operand_type_range opArgTys = op->getOperandTypes();
+ Operation::result_type_range opResTys = op->getResultTypes();
+
+ // The argument/result types to use for kernel look-up.
+ std::vector lookupArgTys;
+ std::vector lookupResTys;
+ // Differences between op argument types and look-up argument types:
+ // - The look-up argument types summarize n occurrences of a variadic
+ // operand into
+ // one variadic pack and one number of occurrences.
+ // - The look-up argument types omit most of the properties of the op
+ // argument types,
+ // because those would complicate the search for matching kernels.
+ // Differences between op result types and look-up result types:
+ // - The look-up result types omit most of the properties of the op
+ // result types,
+ // because those would complicate the search for matching kernels.
+
+ // The operands to use for the CallKernelOp to be created. These may
+ // differ from the operands of the given operation, if it has a variadic
+ // operand.
+ std::vector kernelArgs;
+
+ // *****************************************************************************
+ // Prepare the kernel look-up and the creation of the CallKernelOp
+ // *****************************************************************************
+ // Determine the argument/result types for the kernel look-up as well as
+ // the arguments of the CallKernelOp to be created. Variadic operands
+ // are taken into account.
+
+ // Find out if argument types shall be generalized from matrix/frame to
+ // the supertype structure.
+ // TODO Don't enumerate all ops, decide based on a trait.
+ const bool generalizeInputTypes =
+ llvm::isa(op) || llvm::isa(op) ||
+ llvm::isa(op) || llvm::isa(op) || llvm::isa(op) ||
+ llvm::isa(op) || llvm::isa(op);
+
+ // Append converted op result types to the look-up result types.
+ for (size_t i = 0; i < opResTys.size(); i++)
+ lookupResTys.push_back(adaptType(opResTys[i], false));
+
+ // Append converted op argument types to the look-up argument types.
+ // Variadic operands, which can have an arbitrary number of occurrences,
+ // are treated specially.
+ if (
+ // TODO Unfortunately, one needs to know the exact N for
+ // AtLeastNOperands... There seems to be no simple way to
+ // detect if an operation has variadic ODS operands with any N.
+ op->hasTrait() || op->hasTrait::Impl>() ||
+ op->hasTrait::Impl>()) {
+ // For operations with variadic ODS operands, we replace all
+ // occurrences of a variadic ODS operand by a single operand of
+ // type VariadicPack as well as an operand for the number of
+ // occurrences. All occurrences of the variadic ODS operand are
+ // stored in the VariadicPack.
+ // Note that a variadic ODS operand may have zero occurrences.
+ // In that case, there is no operand corresponding to the
+ // variadic ODS operand.
+ const size_t numODSOperands = getNumODSOperands(op); + for (size_t i = 0; i < numODSOperands; i++) { + auto odsOpInfo = getODSOperandInfo(op, i); + const unsigned idx = std::get<0>(odsOpInfo); + const unsigned len = std::get<1>(odsOpInfo); + const bool isVariadic = std::get<2>(odsOpInfo); + + // Determine the MLIR type of the current ODS operand. + Type odsOperandTy; + if (len > 0) { + // If the current ODS operand has occurrences, then + // we use the type of the first operand belonging to + // the current ODS operand. + odsOperandTy = opArgTys[idx]; + } else { // len == 0 + // If the current ODS operand does not have any occurrences + // (e.g., a variadic ODS operand with zero concrete operands + // provided), then we cannot derive the type of the + // current ODS operand from any given operand. Instead, + // we use a default type depending on which ODS operand of + // which operation it is. + // Note that we cannot simply omit the type, since the + // underlying kernel expects an "empty list" (represented + // in the DAPHNE compiler by an empty VariadicPack). + if (llvm::dyn_cast(op) && i == 2) + // A GroupOp may have zero aggregation column names. + odsOperandTy = daphne::StringType::get(rewriter.getContext()); + else + throw std::runtime_error("RewriteToCallKernelOpPass encountered a variadic " + "ODS operand with zero occurrences, " + "but does not know how to handle it: ODS operand " + + std::to_string(i) + " of operation " + + op->getName().getStringRef().str()); + } - // TODO This method is only required since MLIR does not seem to - // provide a means to get this information. But, for instance, the - // isVariadic boolean array is automatically generated *within* the - // getODSOperandIndexAndLength method. - static std::tuple getODSOperandInfo(Operation * op, unsigned index) { - // TODO Simplify those by a macro. - if(auto concreteOp = llvm::dyn_cast(op)) { - auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); - static bool isVariadic[] = {true, true}; - return std::make_tuple( - idxAndLen.first, - idxAndLen.second, - isVariadic[index] - ); - } - if(auto concreteOp = llvm::dyn_cast(op)) { - auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); - static bool isVariadic[] = {true}; - return std::make_tuple( - idxAndLen.first, - idxAndLen.second, - isVariadic[index] - ); + lookupArgTys.push_back(adaptType(odsOperandTy, generalizeInputTypes)); + + if (isVariadic) { + // Variadic operand. + lookupArgTys.push_back(rewriter.getIndexType()); + auto cvpOp = rewriter.create( + loc, daphne::VariadicPackType::get(rewriter.getContext(), odsOperandTy), + rewriter.getI64IntegerAttr(len)); + for (int64_t k = 0; k < len; k++) + rewriter.create(loc, cvpOp, op->getOperand(idx + k), + rewriter.getI64IntegerAttr(k)); + kernelArgs.push_back(cvpOp); + kernelArgs.push_back( + rewriter.create(loc, rewriter.getIndexType(), rewriter.getIndexAttr(len))); + } else + // Non-variadic operand. + kernelArgs.push_back(op->getOperand(idx)); } - if(auto concreteOp = llvm::dyn_cast(op)) { - auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); - static bool isVariadic[] = {false, true}; - return std::make_tuple( - idxAndLen.first, - idxAndLen.second, - isVariadic[index] - ); + } else + // For operations without variadic operands, we simply append + // the type of each operand to the vector of types to use for + // kernel look-up, and pass all operands to the CallKernelOp as-is. 
+ for (size_t i = 0; i < opArgTys.size(); i++) { + lookupArgTys.push_back(adaptType(opArgTys[i], generalizeInputTypes)); + kernelArgs.push_back(op->getOperand(i)); } - if(auto concreteOp = llvm::dyn_cast(op)) { - auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); - static bool isVariadic[] = {true}; - return std::make_tuple( - idxAndLen.first, - idxAndLen.second, - isVariadic[index] - ); - } - if(auto concreteOp = llvm::dyn_cast(op)) { - auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); - static bool isVariadic[] = {false, true, true}; - return std::make_tuple( - idxAndLen.first, - idxAndLen.second, - isVariadic[index] - ); - } - if(auto concreteOp = llvm::dyn_cast(op)) { - auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); - static bool isVariadic[] = {false, false, true, true}; - return std::make_tuple( - idxAndLen.first, - idxAndLen.second, - isVariadic[index] - ); - } - if(auto concreteOp = llvm::dyn_cast(op)) { - auto idxAndLen = concreteOp.getODSOperandIndexAndLength(index); - static bool isVariadic[] = {false, true, true, false}; - return std::make_tuple( - idxAndLen.first, - idxAndLen.second, - isVariadic[index] - ); - } - throw ErrorHandler::compilerError( - op, "RewriteToCallKernelOpPass", - "lowering to kernel call not yet supported for this variadic " - "operation: " + - op->getName().getStringRef().str()); - } - /** - * @brief The value of type `DaphneContext` to insert as the last - * argument to all kernel calls. - */ - Value dctx; - - const DaphneUserConfig & userConfig; - std::unordered_map & usedLibPaths; - - mlir::Type adaptType(mlir::Type t, bool generalizeToStructure) const { - MLIRContext * mctx = t.getContext(); - if(generalizeToStructure && t.isa()) - return mlir::daphne::StructureType::get(mctx); - if(auto mt = t.dyn_cast()) - return mt.withSameElementTypeAndRepr(); - if(t.isa()) - return mlir::daphne::FrameType::get(mctx, {mlir::daphne::UnknownType::get(mctx)}); - if(auto lt = t.dyn_cast()) - return mlir::daphne::ListType::get(mctx, adaptType(lt.getElementType(), generalizeToStructure)); - if(auto mrt = t.dyn_cast()) - // Remove any dimension information ({0, 0}), but retain the element type. - return mlir::MemRefType::get({0, 0}, mrt.getElementType()); - return t; + if (auto groupOp = llvm::dyn_cast(op)) { + // GroupOp carries the aggregation functions to apply as an + // attribute. Since attributes do not automatically become + // inputs to the kernel call, we need to add them explicitly + // here. + + ArrayAttr aggFuncs = groupOp.getAggFuncs(); + const size_t numAggFuncs = aggFuncs.size(); + const Type t = rewriter.getIntegerType(32, false); + auto cvpOp = rewriter.create( + loc, daphne::VariadicPackType::get(rewriter.getContext(), t), rewriter.getI64IntegerAttr(numAggFuncs)); + size_t k = 0; + for (Attribute aggFunc : aggFuncs.getValue()) + rewriter.create( + loc, cvpOp, + rewriter.create( + loc, t, + rewriter.getIntegerAttr( + t, static_cast(aggFunc.dyn_cast().getValue()))), + rewriter.getI64IntegerAttr(k++)); + kernelArgs.push_back(cvpOp); + kernelArgs.push_back( + rewriter.create(loc, rewriter.getIndexType(), rewriter.getIndexAttr(numAggFuncs))); } - public: - /** - * Creates a new KernelReplacement rewrite pattern. - * - * @param mctx The MLIR context. - * @param dctx The DaphneContext to pass to the kernels. - * @param userConfig The user config. 
- * @param benefit - */ - KernelReplacement( - MLIRContext * mctx, - Value dctx, - const DaphneUserConfig & userConfig, - std::unordered_map & usedLibPaths, - PatternBenefit benefit = 1 - ) - : RewritePattern(Pattern::MatchAnyOpTypeTag(), benefit, mctx), - dctx(dctx), userConfig(userConfig), usedLibPaths(usedLibPaths) - { + if (auto thetaJoinOp = llvm::dyn_cast(op)) { + // ThetaJoinOp carries multiple CompareOperation as an + // attribute. Since attributes do not automatically become + // inputs to the kernel call, we need to add them explicitly + // here. + + // get array of CompareOperations + ArrayAttr compareOperations = thetaJoinOp.getCmp(); + const size_t numCompareOperations = compareOperations.size(); + const Type t = rewriter.getIntegerType(32, false); + // create Variadic Pack + auto cvpOp = rewriter.create( + loc, daphne::VariadicPackType::get(rewriter.getContext(), t), + rewriter.getI64IntegerAttr(numCompareOperations)); + // fill variadic pack + size_t k = 0; + for (Attribute compareOperation : compareOperations.getValue()) + rewriter.create( + loc, cvpOp, + rewriter.create( + loc, t, + rewriter.getIntegerAttr( + t, static_cast( + compareOperation.dyn_cast().getValue()))), + rewriter.getI64IntegerAttr(k++)); + // add created variadic pack and size of this pack as + // new operands / parameters of the ThetaJoin-Kernel call + kernelArgs.push_back(cvpOp); + kernelArgs.push_back(rewriter.create(loc, rewriter.getIndexType(), + rewriter.getIndexAttr(numCompareOperations))); } - /** - * @brief Rewrites the given operation to a `CallKernelOp`. - * - * This involves looking up a matching kernel from the kernel catalog based on the - * mnemonic, argument/result types, and backend (e.g., hardware accelerator) of the - * given operation. Variadic operands are also taken into account. - * - * @param op The operation to rewrite. - * @param rewriter The rewriter. - * @result Always returns `mlir::success()` unless an exception is thrown. - */ - LogicalResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) const override { - Location loc = op->getLoc(); - - // The argument/result types of the given operation. - Operation::operand_type_range opArgTys = op->getOperandTypes(); - Operation::result_type_range opResTys = op->getResultTypes(); - - // The argument/result types to use for kernel look-up. - std::vector lookupArgTys; - std::vector lookupResTys; - // Differences between op argument types and look-up argument types: - // - The look-up argument types summarize n occurrences of a variadic operand into - // one variadic pack and one number of occurrences. - // - The look-up argument types omit most of the properties of the op argument types, - // because those would complicate the search for matching kernels. - // Differences between op result types and look-up result types: - // - The look-up result types omit most of the properties of the op result types, - // because those would complicate the search for matching kernels. - - // The operands to use for the CallKernelOp to be created. These may differ from - // the operands of the given operation, if it has a variadic operand. - std::vector kernelArgs; - - // ***************************************************************************** - // Prepare the kernel look-up and the creation of the CallKernelOp - // ***************************************************************************** - // Determine the argument/result types for the kernel look-up as well as - // the arguments of the CallKernelOp to be created. 
Variadic operands are taken - // into account. - - // Find out if argument types shall the generalized from matrix/frame to the - // supertype structure. - // TODO Don't enumerate all ops, decide based on a trait. - const bool generalizeInputTypes = - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op); - - // Append converted op result types to the look-up result types. - for(size_t i = 0; i < opResTys.size(); i++) - lookupResTys.push_back(adaptType(opResTys[i], false)); - - // Append converted op argument types to the look-up argument types. - // Variadic operands, which can have an arbitrary number of occurrences, are - // treated specially. - if( - // TODO Unfortunately, one needs to know the exact N for - // AtLeastNOperands... There seems to be no simple way to - // detect if an operation has variadic ODS operands with any N. - op->hasTrait() || - op->hasTrait::Impl>() || - op->hasTrait::Impl>() - ) { - // For operations with variadic ODS operands, we replace all - // occurrences of a variadic ODS operand by a single operand of - // type VariadicPack as well as an operand for the number of - // occurrences. All occurrences of the variadic ODS operand are - // stored in the VariadicPack. - // Note that a variadic ODS operand may have zero occurrences. - // In that case, there is no operand corresponding to the - // variadic ODS operand. - const size_t numODSOperands = getNumODSOperands(op); - for(size_t i = 0; i < numODSOperands; i++) { - auto odsOpInfo = getODSOperandInfo(op, i); - const unsigned idx = std::get<0>(odsOpInfo); - const unsigned len = std::get<1>(odsOpInfo); - const bool isVariadic = std::get<2>(odsOpInfo); - - // Determine the MLIR type of the current ODS operand. - Type odsOperandTy; - if(len > 0) { - // If the current ODS operand has occurrences, then - // we use the type of the first operand belonging to - // the current ODS operand. - odsOperandTy = opArgTys[idx]; - } - else { // len == 0 - // If the current ODS operand does not have any occurrences - // (e.g., a variadic ODS operand with zero concrete operands - // provided), then we cannot derive the type of the - // current ODS operand from any given operand. Instead, - // we use a default type depending on which ODS operand of - // which operation it is. - // Note that we cannot simply omit the type, since the - // underlying kernel expects an "empty list" (represented - // in the DAPHNE compiler by an empty VariadicPack). - if(llvm::dyn_cast(op) && i == 2) - // A GroupOp may have zero aggregation column names. - odsOperandTy = daphne::StringType::get(rewriter.getContext()); - else - throw std::runtime_error( - "RewriteToCallKernelOpPass encountered a variadic ODS operand with zero occurrences, " - "but does not know how to handle it: ODS operand " + std::to_string(i) + - " of operation " + op->getName().getStringRef().str() - ); - } - - lookupArgTys.push_back(adaptType(odsOperandTy, generalizeInputTypes)); - - if(isVariadic) { - // Variadic operand. 
- lookupArgTys.push_back(rewriter.getIndexType()); - auto cvpOp = rewriter.create( - loc, - daphne::VariadicPackType::get( - rewriter.getContext(), - odsOperandTy - ), - rewriter.getI64IntegerAttr(len) - ); - for(int64_t k = 0; k < len; k++) - rewriter.create( - loc, - cvpOp, - op->getOperand(idx + k), - rewriter.getI64IntegerAttr(k) - ); - kernelArgs.push_back(cvpOp); - kernelArgs.push_back(rewriter.create( - loc, rewriter.getIndexType(), rewriter.getIndexAttr(len) - )); - } - else - // Non-variadic operand. - kernelArgs.push_back(op->getOperand(idx)); - } - } - else - // For operations without variadic operands, we simply append - // the type of each operand to the vector of types to use for - // kernel look-up, and pass all operands to the CallKernelOp as-is. - for(size_t i = 0; i < opArgTys.size(); i++) { - lookupArgTys.push_back(adaptType(opArgTys[i], generalizeInputTypes)); - kernelArgs.push_back(op->getOperand(i)); - } + if (auto distCompOp = llvm::dyn_cast(op)) { + MLIRContext newContext; // TODO Reuse the existing context. + OpBuilder tempBuilder(&newContext); + std::string funcName = "dist"; - if(auto groupOp = llvm::dyn_cast(op)) { - // GroupOp carries the aggregation functions to apply as an - // attribute. Since attributes do not automatically become - // inputs to the kernel call, we need to add them explicitly - // here. - - ArrayAttr aggFuncs = groupOp.getAggFuncs(); - const size_t numAggFuncs = aggFuncs.size(); - const Type t = rewriter.getIntegerType(32, false); - auto cvpOp = rewriter.create( - loc, - daphne::VariadicPackType::get(rewriter.getContext(), t), - rewriter.getI64IntegerAttr(numAggFuncs) - ); - size_t k = 0; - for(Attribute aggFunc : aggFuncs.getValue()) - rewriter.create( - loc, - cvpOp, - rewriter.create( - loc, - t, - rewriter.getIntegerAttr( - t, - static_cast( - aggFunc.dyn_cast().getValue() - ) - ) - ), - rewriter.getI64IntegerAttr(k++) - ); - kernelArgs.push_back(cvpOp); - kernelArgs.push_back(rewriter.create( - loc, rewriter.getIndexType(), rewriter.getIndexAttr(numAggFuncs)) - ); - } - - if(auto thetaJoinOp = llvm::dyn_cast(op)) { - // ThetaJoinOp carries multiple CompareOperation as an - // attribute. Since attributes do not automatically become - // inputs to the kernel call, we need to add them explicitly - // here. 
- - // get array of CompareOperations - ArrayAttr compareOperations = thetaJoinOp.getCmp(); - const size_t numCompareOperations = compareOperations.size(); - const Type t = rewriter.getIntegerType(32, false); - // create Variadic Pack - auto cvpOp = rewriter.create( - loc, - daphne::VariadicPackType::get(rewriter.getContext(), t), - rewriter.getI64IntegerAttr(numCompareOperations) - ); - // fill variadic pack - size_t k = 0; - for(Attribute compareOperation : compareOperations.getValue()) - rewriter.create( - loc, - cvpOp, - rewriter.create( - loc, - t, - rewriter.getIntegerAttr( - t, - static_cast( - compareOperation.dyn_cast().getValue() - ) - ) - ), - rewriter.getI64IntegerAttr(k++) - ); - // add created variadic pack and size of this pack as - // new operands / parameters of the ThetaJoin-Kernel call - kernelArgs.push_back(cvpOp); - kernelArgs.push_back(rewriter.create( - loc, rewriter.getIndexType(), rewriter.getIndexAttr(numCompareOperations)) - ); - } + auto &bodyBlock = distCompOp.getBody().front(); + auto funcType = + tempBuilder.getFunctionType(bodyBlock.getArgumentTypes(), bodyBlock.getTerminator()->getOperandTypes()); + auto funcOp = tempBuilder.create(loc, funcName, funcType); - if(auto distCompOp = llvm::dyn_cast(op)) { - MLIRContext newContext; // TODO Reuse the existing context. - OpBuilder tempBuilder(&newContext); - std::string funcName = "dist"; - - auto &bodyBlock = distCompOp.getBody().front(); - auto funcType = tempBuilder.getFunctionType( - bodyBlock.getArgumentTypes(), bodyBlock.getTerminator()->getOperandTypes()); - auto funcOp = tempBuilder.create(loc, funcName, funcType); - - IRMapping mapper; - distCompOp.getBody().cloneInto(&funcOp.getRegion(), mapper); - - // write recompile region as string constant - std::string s; - llvm::raw_string_ostream stream(s); - funcOp.print(stream); - - auto strTy = daphne::StringType::get(rewriter.getContext()); - Value - rewriteStr = rewriter.create(loc, strTy, rewriter.getStringAttr(stream.str())); - lookupArgTys.push_back(mlir::daphne::StringType::get(&newContext)); - kernelArgs.push_back(rewriteStr); - } + IRMapping mapper; + distCompOp.getBody().cloneInto(&funcOp.getRegion(), mapper); + + // write recompile region as string constant + std::string s; + llvm::raw_string_ostream stream(s); + funcOp.print(stream); + + auto strTy = daphne::StringType::get(rewriter.getContext()); + Value rewriteStr = rewriter.create(loc, strTy, rewriter.getStringAttr(stream.str())); + lookupArgTys.push_back(mlir::daphne::StringType::get(&newContext)); + kernelArgs.push_back(rewriteStr); + } - // ***************************************************************************** - // Look up a matching kernel from the kernel catalog. - // ***************************************************************************** - - const KernelCatalog & kc = userConfig.kernelCatalog; - const std::string opMnemonic = op->getName().stripDialect().data(); - std::vector kernelInfos = kc.getKernelInfos(opMnemonic); - - std::string libPath; - std::string kernelFuncName; - // TODO Don't hardcode the attribute name, put it in a central place. - if(op->hasAttr("kernel_hint")) { - // The operation has a kernel hint. Lower to the hinted kernel if possible. - - // TODO Check if the attribute has the right type. 
- kernelFuncName = op->getAttrOfType("kernel_hint").getValue().str(); - bool found = false; - for(size_t i = 0; i < kernelInfos.size() && !found; i++) { - auto ki = kernelInfos[i]; - if(ki.kernelFuncName == kernelFuncName) { - libPath = ki.libPath; - found = true; - } + // ***************************************************************************** + // Look up a matching kernel from the kernel catalog. + // ***************************************************************************** + + const KernelCatalog &kc = userConfig.kernelCatalog; + const std::string opMnemonic = op->getName().stripDialect().data(); + std::vector kernelInfos = kc.getKernelInfos(opMnemonic); + + std::string libPath; + std::string kernelFuncName; + // TODO Don't hardcode the attribute name, put it in a central place. + if (op->hasAttr("kernel_hint")) { + // The operation has a kernel hint. Lower to the hinted kernel if + // possible. + + // TODO Check if the attribute has the right type. + kernelFuncName = op->getAttrOfType("kernel_hint").getValue().str(); + bool found = false; + for (size_t i = 0; i < kernelInfos.size() && !found; i++) { + auto ki = kernelInfos[i]; + if (ki.kernelFuncName == kernelFuncName) { + libPath = ki.libPath; + found = true; } - if(!found) - throw ErrorHandler::compilerError( - loc, - "RewriteToCallKernelOpPass", - "no kernel found for operation `" + opMnemonic + - "` with hinted name `" + kernelFuncName + "`" - ); } - else { - // The operation does not have a kernel hint. Search for a kernel - // for this operation and the given result/argument types and backend. - - if(kernelInfos.empty()) - throw ErrorHandler::compilerError( - loc, - "RewriteToCallKernelOpPass", - "no kernels registered for operation `" + opMnemonic + "`" - ); - - std::string backend; - if(op->hasAttr("cuda_device")) - backend = "CUDA"; - else if(op->hasAttr("fpgaopencl_device")) - backend = "FPGAOPENCL"; - else - backend = "CPP"; - - const size_t numArgs = lookupArgTys.size(); - const size_t numRess = lookupResTys.size(); - int chosenKernelIdx = -1; - for(size_t i = 0; i < kernelInfos.size() && chosenKernelIdx == -1; i++) { - auto ki = kernelInfos[i]; - if(ki.backend != backend) - continue; - if(numArgs != ki.argTypes.size()) - continue; - if(numRess != ki.resTypes.size()) - continue; - - bool mismatch = false; - for(size_t i = 0; i < numArgs && !mismatch; i++) - if(lookupArgTys[i] != ki.argTypes[i]) - mismatch = true; - for(size_t i = 0; i < numRess && !mismatch; i++) - if(lookupResTys[i] != ki.resTypes[i]) - mismatch = true; - if(!mismatch) - chosenKernelIdx = i; + if (!found) + throw ErrorHandler::compilerError(loc, "RewriteToCallKernelOpPass", + "no kernel found for operation `" + opMnemonic + + "` with hinted name `" + kernelFuncName + "`"); + } else { + // The operation does not have a kernel hint. Search for a kernel + // for this operation and the given result/argument types and + // backend. 
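+ // A kernel matches only if its backend and its exact argument/result
+ // types (after the adaptations above) agree with the look-up types.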
+ + if (kernelInfos.empty()) + throw ErrorHandler::compilerError(loc, "RewriteToCallKernelOpPass", + "no kernels registered for operation `" + opMnemonic + "`"); + + std::string backend; + if (op->hasAttr("cuda_device")) + backend = "CUDA"; + else if (op->hasAttr("fpgaopencl_device")) + backend = "FPGAOPENCL"; + else + backend = "CPP"; + + const size_t numArgs = lookupArgTys.size(); + const size_t numRess = lookupResTys.size(); + int chosenKernelIdx = -1; + for (size_t i = 0; i < kernelInfos.size() && chosenKernelIdx == -1; i++) { + auto ki = kernelInfos[i]; + if (ki.backend != backend) + continue; + if (numArgs != ki.argTypes.size()) + continue; + if (numRess != ki.resTypes.size()) + continue; + + bool mismatch = false; + for (size_t i = 0; i < numArgs && !mismatch; i++) + if (lookupArgTys[i] != ki.argTypes[i]) + mismatch = true; + for (size_t i = 0; i < numRess && !mismatch; i++) + if (lookupResTys[i] != ki.resTypes[i]) + mismatch = true; + if (!mismatch) + chosenKernelIdx = i; + } + if (chosenKernelIdx == -1) { + std::stringstream s; + s << "no kernel for operation `" << opMnemonic << "` available for the required input types `("; + for (size_t i = 0; i < numArgs; i++) { + s << lookupArgTys[i]; + if (i < numArgs - 1) + s << ", "; } - if(chosenKernelIdx == -1) { - std::stringstream s; - s << "no kernel for operation `" << opMnemonic - << "` available for the required input types `("; - for(size_t i = 0; i < numArgs; i++) { - s << lookupArgTys[i]; - if(i < numArgs - 1) - s << ", "; - } - s << + ")` and output types `("; - for(size_t i = 0; i < numRess; i++) { - s << lookupResTys[i]; - if(i < numRess - 1) - s << ", "; - } - s << ")` for backend `" << backend << "`, registered kernels for this op:" << std::endl; - kc.dump(opMnemonic, s); - throw ErrorHandler::compilerError(loc, "RewriteToCallKernelOpPass", s.str()); + s << +")` and output types `("; + for (size_t i = 0; i < numRess; i++) { + s << lookupResTys[i]; + if (i < numRess - 1) + s << ", "; } - KernelInfo chosenKI = kernelInfos[chosenKernelIdx]; - libPath = chosenKI.libPath; - kernelFuncName = chosenKI.kernelFuncName; + s << ")` for backend `" << backend << "`, registered kernels for this op:" << std::endl; + kc.dump(opMnemonic, s); + throw ErrorHandler::compilerError(loc, "RewriteToCallKernelOpPass", s.str()); } - - // ***************************************************************************** - // Add kernel id and DAPHNE context as arguments - // ***************************************************************************** - - auto kId = rewriter.create( - loc, rewriter.getI32IntegerAttr( - KernelDispatchMapping::instance().registerKernel( - kernelFuncName, op))); - - // NOTE: kId has to be added before CreateDaphneContextOp because - // there is an assumption that the CTX is the last argument - // (LowerToLLVMPass.cpp::623,702). This means the kId is expected to - // be the second to last argument. - kernelArgs.push_back(kId); - - // Inject the current DaphneContext as the last input parameter to - // all kernel calls, unless it's a CreateDaphneContextOp. - if(!llvm::isa(op)) - kernelArgs.push_back(dctx); - - // ***************************************************************************** - // Create the CallKernelOp - // ***************************************************************************** - - // Mark the shared library the chosen kernel comes from as used. This means we - // will link this library into the JIT-compiled program later. 
- usedLibPaths.at(libPath) = true; - - // Create a CallKernelOp for the kernel function to call and return success(). - auto kernel = rewriter.create( - loc, - kernelFuncName, - kernelArgs, - opResTys - ); - rewriter.replaceOp(op, kernel.getResults()); - return success(); - } - }; - - class DistributedPipelineKernelReplacement : public OpConversionPattern { - Value dctx; - const DaphneUserConfig & userConfig; - std::unordered_map & usedLibPaths; - - public: - using OpConversionPattern::OpConversionPattern; - DistributedPipelineKernelReplacement( - MLIRContext * mctx, - Value dctx, - const DaphneUserConfig & userConfig, - std::unordered_map & usedLibPaths, - PatternBenefit benefit = 2 - ) - : OpConversionPattern(mctx, benefit), - dctx(dctx), userConfig(userConfig), usedLibPaths(usedLibPaths) - { + KernelInfo chosenKI = kernelInfos[chosenKernelIdx]; + libPath = chosenKI.libPath; + kernelFuncName = chosenKI.kernelFuncName; } - LogicalResult matchAndRewrite(daphne::DistributedPipelineOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override - { - size_t numOutputs = op.getOutputs().size(); - size_t numInputs = op.getInputs().size(); - - - std::stringstream callee; - callee << "_distributedPipeline"; // kernel name - callee << "__DenseMatrix_double_variadic" // outputs - << "__size_t" // numOutputs - << "__Structure_variadic" // inputs - << "__size_t" // numInputs - << "__int64_t" // outRows - << "__int64_t" // outCols - << "__int64_t" // splits - << "__int64_t" // combines - << "__char"; // irCode - - MLIRContext* mctx = rewriter.getContext(); - - Location loc = op.getLoc(); - Type vptObj = daphne::VariadicPackType::get(mctx, daphne::MatrixType::get(mctx, rewriter.getF64Type())); - Type vptSize = daphne::VariadicPackType::get(mctx, rewriter.getIntegerType(64, false)); - Type vptInt64 = daphne::VariadicPackType::get(mctx, rewriter.getIntegerType(64, true)); - - // Variadic pack for inputs. - auto cvpInputs = rewriter.create(loc, vptObj, rewriter.getI64IntegerAttr(numInputs)); - for(size_t i = 0; i < numInputs; i++) - rewriter.create( - loc, cvpInputs, op.getInputs()[i], rewriter.getI64IntegerAttr(i) - ); - // Constants for #inputs. - auto coNumInputs = rewriter.create(loc, numInputs); - [[maybe_unused]] auto coNumOutputs = rewriter.create(loc, numOutputs); - // Variadic pack for out_rows. - auto cvpOutRows = rewriter.create(loc, vptSize, rewriter.getI64IntegerAttr(numOutputs)); - for(size_t i = 0; i < numOutputs; i++) - rewriter.create( - loc, cvpOutRows, op.getOutRows()[i], rewriter.getI64IntegerAttr(i) - ); - // Variadic pack for out_cols. - auto cvpOutCols = rewriter.create(loc, vptSize, rewriter.getI64IntegerAttr(numOutputs)); - for(size_t i = 0; i < numOutputs; i++) - rewriter.create( - loc, cvpOutCols, op.getOutCols()[i], rewriter.getI64IntegerAttr(i) - ); - // Variadic pack for splits. - auto cvpSplits = rewriter.create(loc, vptInt64, rewriter.getI64IntegerAttr(numInputs)); - for(size_t i = 0; i < numInputs; i++) - rewriter.create( - loc, - cvpSplits, - rewriter.create( - loc, static_cast(op.getSplits()[i].dyn_cast().getValue()) - ), - rewriter.getI64IntegerAttr(i) - ); - // Variadic pack for combines. - auto cvpCombines = rewriter.create(loc, vptInt64, rewriter.getI64IntegerAttr(numOutputs)); - for(size_t i = 0; i < numOutputs; i++) - rewriter.create( - loc, - cvpCombines, - rewriter.create( - loc, static_cast(op.getCombines()[i].dyn_cast().getValue()) - ), - rewriter.getI64IntegerAttr(i) - ); - - // Create CallKernelOp. 
- std::vector newOperands = { - cvpInputs, coNumInputs, cvpOutRows, cvpOutCols, cvpSplits, cvpCombines, op.getIr(), dctx - }; - auto cko = rewriter.replaceOpWithNewOp( - op.getOperation(), - callee.str(), - newOperands, - op.getOutputs().getTypes() - ); - // TODO Use ATTR_HASVARIADICRESULTS from LowerToLLVMPass.cpp. - cko->setAttr("hasVariadicResults", rewriter.getBoolAttr(true)); - - return success(); - } - }; - - struct RewriteToCallKernelOpPass - : public PassWrapper> - { - const DaphneUserConfig& userConfig; - std::unordered_map & usedLibPaths; - - explicit RewriteToCallKernelOpPass( - const DaphneUserConfig& cfg, std::unordered_map & usedLibPaths - ) : userConfig(cfg), usedLibPaths(usedLibPaths) {} - - void runOnOperation() final; - }; -} - -void RewriteToCallKernelOpPass::runOnOperation() -{ + // ***************************************************************************** + // Add kernel id and DAPHNE context as arguments + // ***************************************************************************** + + auto kId = rewriter.create( + loc, rewriter.getI32IntegerAttr(KernelDispatchMapping::instance().registerKernel(kernelFuncName, op))); + + // NOTE: kId has to be added before CreateDaphneContextOp because + // there is an assumption that the CTX is the last argument + // (LowerToLLVMPass.cpp::623,702). This means the kId is expected to + // be the second to last argument. + kernelArgs.push_back(kId); + + // Inject the current DaphneContext as the last input parameter to + // all kernel calls, unless it's a CreateDaphneContextOp. + if (!llvm::isa(op)) + kernelArgs.push_back(dctx); + + // ***************************************************************************** + // Create the CallKernelOp + // ***************************************************************************** + + // Mark the shared library the chosen kernel comes from as used. This + // means we will link this library into the JIT-compiled program later. + usedLibPaths.at(libPath) = true; + + // Create a CallKernelOp for the kernel function to call and return + // success(). 
+ auto kernel = rewriter.create(loc, kernelFuncName, kernelArgs, opResTys); + rewriter.replaceOp(op, kernel.getResults()); + return success(); + } +}; + +class DistributedPipelineKernelReplacement : public OpConversionPattern { + Value dctx; + const DaphneUserConfig &userConfig; + std::unordered_map &usedLibPaths; + + public: + using OpConversionPattern::OpConversionPattern; + DistributedPipelineKernelReplacement(MLIRContext *mctx, Value dctx, const DaphneUserConfig &userConfig, + std::unordered_map &usedLibPaths, + PatternBenefit benefit = 2) + : OpConversionPattern(mctx, benefit), dctx(dctx), userConfig(userConfig), usedLibPaths(usedLibPaths) {} + + LogicalResult matchAndRewrite(daphne::DistributedPipelineOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + size_t numOutputs = op.getOutputs().size(); + size_t numInputs = op.getInputs().size(); + + std::stringstream callee; + callee << "_distributedPipeline"; // kernel name + callee << "__DenseMatrix_double_variadic" // outputs + << "__size_t" // numOutputs + << "__Structure_variadic" // inputs + << "__size_t" // numInputs + << "__int64_t" // outRows + << "__int64_t" // outCols + << "__int64_t" // splits + << "__int64_t" // combines + << "__char"; // irCode + + MLIRContext *mctx = rewriter.getContext(); + + Location loc = op.getLoc(); + Type vptObj = daphne::VariadicPackType::get(mctx, daphne::MatrixType::get(mctx, rewriter.getF64Type())); + Type vptSize = daphne::VariadicPackType::get(mctx, rewriter.getIntegerType(64, false)); + Type vptInt64 = daphne::VariadicPackType::get(mctx, rewriter.getIntegerType(64, true)); + + // Variadic pack for inputs. + auto cvpInputs = + rewriter.create(loc, vptObj, rewriter.getI64IntegerAttr(numInputs)); + for (size_t i = 0; i < numInputs; i++) + rewriter.create(loc, cvpInputs, op.getInputs()[i], + rewriter.getI64IntegerAttr(i)); + // Constants for #inputs. + auto coNumInputs = rewriter.create(loc, numInputs); + [[maybe_unused]] auto coNumOutputs = rewriter.create(loc, numOutputs); + // Variadic pack for out_rows. + auto cvpOutRows = + rewriter.create(loc, vptSize, rewriter.getI64IntegerAttr(numOutputs)); + for (size_t i = 0; i < numOutputs; i++) + rewriter.create(loc, cvpOutRows, op.getOutRows()[i], + rewriter.getI64IntegerAttr(i)); + // Variadic pack for out_cols. + auto cvpOutCols = + rewriter.create(loc, vptSize, rewriter.getI64IntegerAttr(numOutputs)); + for (size_t i = 0; i < numOutputs; i++) + rewriter.create(loc, cvpOutCols, op.getOutCols()[i], + rewriter.getI64IntegerAttr(i)); + // Variadic pack for splits. + auto cvpSplits = + rewriter.create(loc, vptInt64, rewriter.getI64IntegerAttr(numInputs)); + for (size_t i = 0; i < numInputs; i++) + rewriter.create( + loc, cvpSplits, + rewriter.create( + loc, static_cast(op.getSplits()[i].dyn_cast().getValue())), + rewriter.getI64IntegerAttr(i)); + // Variadic pack for combines. + auto cvpCombines = + rewriter.create(loc, vptInt64, rewriter.getI64IntegerAttr(numOutputs)); + for (size_t i = 0; i < numOutputs; i++) + rewriter.create( + loc, cvpCombines, + rewriter.create( + loc, static_cast(op.getCombines()[i].dyn_cast().getValue())), + rewriter.getI64IntegerAttr(i)); + + // Create CallKernelOp. + std::vector newOperands = {cvpInputs, coNumInputs, cvpOutRows, cvpOutCols, + cvpSplits, cvpCombines, op.getIr(), dctx}; + auto cko = rewriter.replaceOpWithNewOp(op.getOperation(), callee.str(), newOperands, + op.getOutputs().getTypes()); + // TODO Use ATTR_HASVARIADICRESULTS from LowerToLLVMPass.cpp. 
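+ // The distributed pipeline produces a variable number of outputs, so
+ // the created call is flagged for the later lowering to LLVM.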
+ cko->setAttr("hasVariadicResults", rewriter.getBoolAttr(true)); + + return success(); + } +}; + +struct RewriteToCallKernelOpPass : public PassWrapper> { + const DaphneUserConfig &userConfig; + std::unordered_map &usedLibPaths; + + explicit RewriteToCallKernelOpPass(const DaphneUserConfig &cfg, std::unordered_map &usedLibPaths) + : userConfig(cfg), usedLibPaths(usedLibPaths) {} + + void runOnOperation() final; +}; +} // namespace + +void RewriteToCallKernelOpPass::runOnOperation() { func::FuncOp func = getOperation(); RewritePatternSet patterns(&getContext()); @@ -713,47 +604,29 @@ void RewriteToCallKernelOpPass::runOnOperation() // Specification of (il)legal dialects/operations. All DaphneIR operations // but those explicitly marked as legal will be replaced by CallKernelOp. ConversionTarget target(getContext()); - target.addLegalDialect(); + target.addLegalDialect(); target.addLegalOp(); target.addIllegalDialect(); - target.addLegalOp< - daphne::ConstantOp, - daphne::ReturnOp, - daphne::CallKernelOp, - daphne::CreateVariadicPackOp, - daphne::StoreVariadicPackOp, - daphne::VectorizedPipelineOp, - scf::ForOp, - memref::LoadOp, - daphne::GenericCallOp, - daphne::MapOp - >(); - target.addDynamicallyLegalOp([](daphne::CastOp op) { - return op.isTrivialCast() || op.isRemovePropertyCast(); - }); + target.addLegalOp(); + target.addDynamicallyLegalOp( + [](daphne::CastOp op) { return op.isTrivialCast() || op.isRemovePropertyCast(); }); // Determine the DaphneContext valid in the MLIR function being rewritten. mlir::Value dctx = CompilerUtils::getDaphneContext(func); - func->walk([&](daphne::VectorizedPipelineOp vpo) - { - vpo.getCtxMutable().assign(dctx); - }); + func->walk([&](daphne::VectorizedPipelineOp vpo) { vpo.getCtxMutable().assign(dctx); }); // Apply conversion to CallKernelOps. - patterns.insert< - KernelReplacement, - DistributedPipelineKernelReplacement - >(&getContext(), dctx, userConfig, usedLibPaths); + patterns.insert(&getContext(), dctx, userConfig, + usedLibPaths); if (failed(applyPartialConversion(func, target, std::move(patterns)))) signalPassFailure(); - } -std::unique_ptr daphne::createRewriteToCallKernelOpPass(const DaphneUserConfig& cfg, std::unordered_map & usedLibPaths) -{ +std::unique_ptr daphne::createRewriteToCallKernelOpPass(const DaphneUserConfig &cfg, + std::unordered_map &usedLibPaths) { return std::make_unique(cfg, usedLibPaths); } diff --git a/src/compiler/lowering/SpecializeGenericFunctionsPass.cpp b/src/compiler/lowering/SpecializeGenericFunctionsPass.cpp index 3541121c2..e6b98bc8a 100644 --- a/src/compiler/lowering/SpecializeGenericFunctionsPass.cpp +++ b/src/compiler/lowering/SpecializeGenericFunctionsPass.cpp @@ -14,10 +14,10 @@ * limitations under the License. */ -#include -#include #include "ir/daphneir/Daphne.h" #include "ir/daphneir/Passes.h" +#include +#include #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -26,435 +26,448 @@ #include "mlir/Transforms/Passes.h" #include +#include #include #include -#include #include using namespace mlir; namespace { - /** - * @brief Checks if the function is untyped, i.e., if at least one of the inputs is - * of unknown type. 
- * - * @param op The `FuncOp` to check - * @return true if `FuncOp` is untyped, false otherwise - */ - bool isUntypedFunction(func::FuncOp op) { - return llvm::any_of( - op.getFunctionType().getInputs(), - [&](Type ty) { - auto matTy = ty.dyn_cast(); - return - llvm::isa(ty) || - (matTy && (llvm::isa(matTy.getElementType()))); - } - ); - } +/** + * @brief Checks if the function is untyped, i.e., if at least one of the inputs + * is of unknown type. + * + * @param op The `FuncOp` to check + * @return true if `FuncOp` is untyped, false otherwise + */ +bool isUntypedFunction(func::FuncOp op) { + return llvm::any_of(op.getFunctionType().getInputs(), [&](Type ty) { + auto matTy = ty.dyn_cast(); + return llvm::isa(ty) || + (matTy && (llvm::isa(matTy.getElementType()))); + }); +} - /** - * @brief Checks if the function is a template, by checking the types of input arguments. - * - * We consider a function a template iff: - * (1) it is an untyped function (i.e., at least one of the inputs is of unknown type - * or a matrix of unknown value type), or - * (2) at least one of the inputs is a matrix with unknown properties - * - * @param op The `FuncOp` to check - * @return true if `FuncOp` is a template, false otherwise - */ - bool isFunctionTemplate(func::FuncOp op) { - return llvm::any_of( - op.getFunctionType().getInputs(), - [&](Type ty) { - auto matTy = ty.dyn_cast(); - return - llvm::isa(ty) || - (matTy && ( - llvm::isa(matTy.getElementType()) || - (matTy.getNumRows() == -1 && matTy.getNumCols() == -1 && matTy.getSparsity() == -1) - )); - } - ); +/** + * @brief Checks if the function is a template, by checking the types of input + * arguments. + * + * We consider a function a template iff: + * (1) it is an untyped function (i.e., at least one of the inputs is of unknown + * type or a matrix of unknown value type), or (2) at least one of the inputs is + * a matrix with unknown properties + * + * @param op The `FuncOp` to check + * @return true if `FuncOp` is a template, false otherwise + */ +bool isFunctionTemplate(func::FuncOp op) { + return llvm::any_of(op.getFunctionType().getInputs(), [&](Type ty) { + auto matTy = ty.dyn_cast(); + return llvm::isa(ty) || + (matTy && (llvm::isa(matTy.getElementType()) || + (matTy.getNumRows() == -1 && matTy.getNumCols() == -1 && matTy.getSparsity() == -1))); + }); +} + +std::string uniqueSpecializedFuncName(const std::string &functionName) { + static unsigned functionUniqueId = 0; + return functionName + "-" + std::to_string(++functionUniqueId); +} + +/** + * @brief Check if a function with the given input/output types can be called + * with the input types given. + * @param functionType The type of the function + * @param callTypes The types used in the call + * @return true if the types match for a call, false otherwise + */ +bool callTypesMatchFunctionTypes(FunctionType functionType, TypeRange callTypes) { + for (auto zipIt : llvm::zip(functionType.getInputs(), callTypes)) { + auto funcTy = std::get<0>(zipIt); + auto callTy = std::get<1>(zipIt); + // Note that we explicitly take all properties (e.g., shape) into + // account. + if (funcTy != callTy) + return false; } + return true; +} - std::string uniqueSpecializedFuncName(const std::string &functionName) { - static unsigned functionUniqueId = 0; - return functionName + "-" + std::to_string(++functionUniqueId); +/** + * @brief Get argument types for the specialized version of a template function. + * @param functionType The types of the template function. 
+ * @param callTypes The types used in the call to the specialized version.
+ * @param funcName The name of the function to call
+ * @param callLoc The location of the call
+ * @return The argument types to use for the specialized version
+ */
+std::vector getSpecializedFuncArgTypes(FunctionType functionType, TypeRange callTypes,
+ const std::string &funcName, mlir::Location callLoc) {
+ auto unknownTy = daphne::UnknownType::get(functionType.getContext());
+ std::vector specializedTypes;
+ for (auto it : llvm::enumerate(llvm::zip(functionType.getInputs(), callTypes))) {
+ auto index = it.index();
+ auto funcInTy = std::get<0>(it.value());
+ auto specializedTy = std::get<1>(it.value());
+ if (funcInTy != specializedTy) {
+ auto funcMatTy = funcInTy.dyn_cast();
+ auto specializedMatTy = specializedTy.dyn_cast();
+ bool isMatchingUnknownMatrix = funcMatTy && specializedMatTy && funcMatTy.getElementType() == unknownTy;
+ bool isMatchingUnknownPropertiesMatrix =
+ funcMatTy && specializedMatTy && funcMatTy.getElementType() == specializedMatTy.getElementType() &&
+ funcMatTy.getNumRows() == -1 && funcMatTy.getNumCols() == -1 && funcMatTy.getSparsity() == -1;
+ if (!isMatchingUnknownMatrix && !isMatchingUnknownPropertiesMatrix && funcInTy != unknownTy) {
+ std::string s;
+ llvm::raw_string_ostream stream(s);
+ // TODO The function name funcName has a cryptic suffix from
+ // overloading/specialization, which is not suitable for users
+ // to see.
+ // TODO This error message can show up even for typed functions
+ // which are not "templates", which is confusing for a user.
+ // TODO The index seems to be off by 1 (too large)... (or not,
+ // simply 0-based counting).
+ stream << "call to function template `" << funcName << "` with invalid types for argument " << index
+ << ": expected `" << funcInTy << "`, got `" << specializedTy << "`";
+ throw ErrorHandler::compilerError(callLoc, "SpecializeGenericFunctionsPass", stream.str());
+ }
+ }
+ // Note that specializedTy may explicitly contain property information
+ // (e.g., shape).
+ specializedTypes.push_back(specializedTy);
+ }
+ return specializedTypes;
+}

- /**
- * @brief Check if a function with the given input/output types can be called with the input types given.
- * @param functionType The type of the function
- * @param callTypes The types used in the call
- * @return true if the types match for a call, false otherwise
- */
- bool callTypesMatchFunctionTypes(FunctionType functionType, TypeRange callTypes) {
- for(auto zipIt : llvm::zip(functionType.getInputs(), callTypes)) {
- auto funcTy = std::get<0>(zipIt);
- auto callTy = std::get<1>(zipIt);
- // Note that we explicitly take all properties (e.g., shape) into account.
- if(funcTy != callTy)
- return false;
+ /**
+ * @brief Set the result types to the types of the function results.
+ * @param results The results for which to fix the types
+ * @param functionType The function type
+ * @return true if changes were made, else false
+ */
+ bool fixResultTypes(ResultRange results, FunctionType functionType) {
+ bool madeChanges = false;
+ for (auto it : llvm::zip(results, functionType.getResults())) {
+ auto result = std::get<0>(it);
+ auto functionResultTy = std::get<1>(it);
+ if (result.getType() != functionResultTy) {
+ madeChanges = true;
+ result.setType(functionResultTy);
+ }
+ }
+ return madeChanges;
+ }

/**
+ * @brief Run partial type and label inference on the given `FuncOp`.
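+ * Inference and canonicalization are run in alternation, since type/shape
+ * inference and constant folding (part of canonicalization) depend on each
+ * other (see #173).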
+ * @param function The `FuncOp` + * @return The inferred `FuncOp` (same as input), or `nullptr` if an error + * happened + */ +func::FuncOp inferTypesInFunction(func::FuncOp function) { + // Run inference + mlir::PassManager pm(function->getContext(), "func.func"); + pm.enableVerifier(false); + // TODO There is a cyclic dependency between (shape) inference and + // constant folding (included in canonicalization), at the moment we + // run only three iterations of both passes (see #173). + pm.addPass(daphne::createInferencePass({true, true, true, true, true})); + pm.addPass(createCanonicalizerPass()); + pm.addPass(daphne::createInferencePass({true, true, true, true, true})); + pm.addPass(createCanonicalizerPass()); + pm.addPass(daphne::createInferencePass({true, true, true, true, true})); + pm.addPass(createCanonicalizerPass()); + pm.addPass(daphne::createInferencePass({true, true, true, true, true})); + pm.addPass(createCanonicalizerPass()); + if (failed(pm.run(function))) { + throw ErrorHandler::compilerError(function.getOperation(), "SpecializeGenericFunctionsPass", + "could not infer types for a call of function template: " + + function.getName().str()); + } + return function; +} +class SpecializeGenericFunctionsPass : public PassWrapper> { + std::unordered_map functions; + std::multimap specializedVersions; + std::set visited; + std::set called; + std::set templateFunctions; + + const DaphneUserConfig &userConfig; + std::shared_ptr logger; + + public: + explicit SpecializeGenericFunctionsPass(const DaphneUserConfig &cfg) : userConfig(cfg) { + logger = spdlog::get("compiler"); + } + + private: /** - * @brief Get argument types for the specialized version of a template function. - * @param functionType The types of the template function. - * @param callTypes The types used in the call to the specialized version. - * @param funcName The name of the function to call - * @param callLoc The location of the call - * @return The argument types to use for the specialized version + * @brief Create a specialized version of the template function. + * @param templateFunction The template function. + * @param specializedTypes The specialized function arguments + * @param operands The operands of the call operation + * @return The specialized function */ - std::vector getSpecializedFuncArgTypes(FunctionType functionType, TypeRange callTypes, const std::string & funcName, mlir::Location callLoc) { - auto unknownTy = daphne::UnknownType::get(functionType.getContext()); - std::vector specializedTypes; - for(auto it : llvm::enumerate(llvm::zip(functionType.getInputs(), callTypes))) { - auto index = it.index(); - auto funcInTy = std::get<0>(it.value()); - auto specializedTy = std::get<1>(it.value()); - if(funcInTy != specializedTy) { - auto funcMatTy = funcInTy.dyn_cast(); - auto specializedMatTy = specializedTy.dyn_cast(); - bool isMatchingUnknownMatrix = - funcMatTy && specializedMatTy && funcMatTy.getElementType() == unknownTy; - bool isMatchingUnknownPropertiesMatrix = - funcMatTy && specializedMatTy && funcMatTy.getElementType() == specializedMatTy.getElementType() && - funcMatTy.getNumRows() == -1 && funcMatTy.getNumCols() == -1 && funcMatTy.getSparsity() == -1; - if(!isMatchingUnknownMatrix && !isMatchingUnknownPropertiesMatrix && funcInTy != unknownTy) { - std::string s; - llvm::raw_string_ostream stream(s); - // TODO The function name funcName has a cryptic suffix from overloading/specialization, which is not suitable for users for see. 
- // TODO This error message can shiw up even for typed functions which are no "templates", which is confusing for a user. - // TODO The index seems to be off by 1 (too large)... (or not, simply 0-based counting). - stream << "call to function template `" << funcName << "` with invalid types for argument " << index - << ": expected `" << funcInTy << "`, got `" << specializedTy << "`"; - throw ErrorHandler::compilerError(callLoc, "SpecializeGenericFunctionsPass", stream.str()); + func::FuncOp createSpecializedFunction(func::FuncOp templateFunction, TypeRange specializedTypes, + ValueRange operands) { + OpBuilder builder(templateFunction); + auto specializedFunc = templateFunction.clone(); + builder.insert(specializedFunc); + + auto uniqueFuncName = uniqueSpecializedFuncName(templateFunction.getSymName().str()); + specializedFunc.setName(uniqueFuncName); + functions.insert({uniqueFuncName, specializedFunc}); + + // change argument types + specializedFunc.setType( + builder.getFunctionType(specializedTypes, specializedFunc.getFunctionType().getResults())); + for (auto it : llvm::zip(specializedFunc.getArguments(), specializedTypes)) { + std::get<0>(it).setType(std::get<1>(it)); + } + + bool insertedConst = false; + // Don't propagate constants into untyped functions, since that still + // causes problems for some reason. + if (userConfig.use_ipa_const_propa && !isUntypedFunction(templateFunction)) { + // Insert compile-time constant scalar call operands into the + // function. + Block &specializedFuncBodyBlock = specializedFunc.getBody().front(); + builder.setInsertionPointToStart(&specializedFuncBodyBlock); + for (auto it : llvm::enumerate(operands)) { + auto i = it.index(); + Value v = it.value(); + if (Operation *co = CompilerUtils::constantOfAnyType(v)) { + // Clone the constant operation into the function body. + Operation *coNew = co->clone(); + builder.insert(coNew); + // Replace all uses of the corresponding block argument by + // the newly inserted constant. + specializedFuncBodyBlock.getArgument(i).replaceAllUsesWith(coNew->getResult(0)); + // TODO We could even remove the corresponding function + // argument. + insertedConst = true; } } - // Note that specializedTy may explicitly contain property information (e.g., shape). - specializedTypes.push_back(specializedTy); } - return specializedTypes; + // Remember the newly specialized function for reuse only if we did not + // insert any constant call operands. + // TODO We could reuse it for other calls with the same constant (it's + // just more book-keeping effort). + if (!insertedConst) + specializedVersions.insert({templateFunction.getSymName().str(), specializedFunc}); + + return inferTypesInFunction(specializedFunc); } /** - * @brief Set the result types to the types of the function results. 
- * @param results The results for which to fix the types - * @param functionType The function type - * @return true if changes where made, else false + * @brief Try to reuse an existing specialization for the given template + * function + * @param operandTypes Operand types of the call operation + * @param operands Operands of the call operation or an empty list if the + * operands are not available + * @param templateFunction The template function called by the call + * operation + * @return either an existing and matching `FuncOp`, `nullptr` otherwise */ - bool fixResultTypes(ResultRange results, FunctionType functionType) { - bool madeChanges = false; - for(auto it : llvm::zip(results, functionType.getResults())) { - auto result = std::get<0>(it); - auto functionResultTy = std::get<1>(it); - if(result.getType() != functionResultTy) { - madeChanges = true; - result.setType(functionResultTy); + func::FuncOp tryReuseExistingSpecialization(TypeRange operandTypes, ValueRange operands, + func::FuncOp templateFunction) { + if (userConfig.use_ipa_const_propa) { + // If any call operand is a compile-time constant scalar, we don't + // reuse an existing specialization, but create a new one while + // propagating the constant to the function body. + // TODO We could reuse a former specialization that uses the same + // constant. + for (Value v : operands) + if (CompilerUtils::constantOfAnyType(v)) + return nullptr; + } + + // Try to find a reusable function specialization based on types and + // data properties. + auto eqIt = specializedVersions.equal_range(templateFunction.getSymName().str()); + for (auto it = eqIt.first; it != eqIt.second; ++it) { + auto specializedFunc = it->second; + + if (callTypesMatchFunctionTypes(specializedFunc.getFunctionType(), operandTypes)) { + // reuse existing specialized function + return specializedFunc; } } - return madeChanges; + + return nullptr; } /** - * @brief Run partial type and label inference on the given `FuncOp`. - * @param function The `FuncOp` - * @return The inferred `FuncOp` (same as input), or `nullptr` if an error happened + * @brief Try to reuse an existing specializtion if one exists, else creates + * a new specialization + * @param operandTypes Operand types of the call operation + * @param operands Operands of the call operation or an empty list if the + * operands are not available + * @param calledFunction The function called by the call operation + * @param callLoc The location of the call for which a function + * specialization shall be created or reused + * @return A `FuncOp`for the specialization */ - func::FuncOp inferTypesInFunction(func::FuncOp function) { - // Run inference - mlir::PassManager pm(function->getContext(), "func.func"); - pm.enableVerifier(false); - // TODO There is a cyclic dependency between (shape) inference and - // constant folding (included in canonicalization), at the moment we - // run only three iterations of both passes (see #173). 
- pm.addPass(daphne::createInferencePass({true, true, true, true, true})); - pm.addPass(createCanonicalizerPass()); - pm.addPass(daphne::createInferencePass({true, true, true, true, true})); - pm.addPass(createCanonicalizerPass()); - pm.addPass(daphne::createInferencePass({true, true, true, true, true})); - pm.addPass(createCanonicalizerPass()); - pm.addPass(daphne::createInferencePass({true, true, true, true, true})); - pm.addPass(createCanonicalizerPass()); - if(failed(pm.run(function))) { - throw ErrorHandler::compilerError( - function.getOperation(), "SpecializeGenericFunctionsPass", - "could not infer types for a call of function template: " + - function.getName().str()); + func::FuncOp createOrReuseSpecialization(TypeRange operandTypes, ValueRange operands, func::FuncOp calledFunction, + mlir::Location callLoc) { + // check for existing specialization that matches + func::FuncOp specializedFunc = tryReuseExistingSpecialization(operandTypes, operands, calledFunction); + if (!specializedFunc) { + // Create specialized function + auto specializedTypes = getSpecializedFuncArgTypes(calledFunction.getFunctionType(), operandTypes, + calledFunction.getSymName().str(), callLoc); + specializedFunc = createSpecializedFunction(calledFunction, specializedTypes, operands); + } + if (logger->should_log(spdlog::level::debug)) { + std::string s; + llvm::raw_string_ostream stream(s); + calledFunction->getLoc().print(stream); + logger->debug("calledFunction\n\tname: {}\n\tlocation: {}", calledFunction.getSymName().str(), s); } - return function; + templateFunctions.insert(calledFunction); + return specializedFunc; } - class SpecializeGenericFunctionsPass - : public PassWrapper> { - std::unordered_map functions; - std::multimap specializedVersions; - std::set visited; - std::set called; - std::set templateFunctions; - - const DaphneUserConfig& userConfig; - std::shared_ptr logger; - - public: - explicit SpecializeGenericFunctionsPass(const DaphneUserConfig& cfg) : userConfig(cfg) { - logger = spdlog::get("compiler"); + /** + * @brief Recursively specializes all functions within a `FuncOp` based on + * calls to the functions + * @param function The `FuncOp` to scan for function specializations + */ + void specializeCallsInFunction(func::FuncOp function) { + if (visited.count(function)) { + return; } - - private: - /** - * @brief Create a specialized version of the template function. - * @param templateFunction The template function. - * @param specializedTypes The specialized function arguments - * @param operands The operands of the call operation - * @return The specialized function - */ - func::FuncOp createSpecializedFunction(func::FuncOp templateFunction, TypeRange specializedTypes, ValueRange operands) { - OpBuilder builder(templateFunction); - auto specializedFunc = templateFunction.clone(); - builder.insert(specializedFunc); - - auto uniqueFuncName = uniqueSpecializedFuncName(templateFunction.getSymName().str()); - specializedFunc.setName(uniqueFuncName); - functions.insert({uniqueFuncName, specializedFunc}); - - // change argument types - specializedFunc - .setType(builder.getFunctionType(specializedTypes, specializedFunc.getFunctionType().getResults())); - for(auto it : llvm::zip(specializedFunc.getArguments(), specializedTypes)) { - std::get<0>(it).setType(std::get<1>(it)); - } - - bool insertedConst = false; - // Don't propagate constants into untyped functions, since that still causes problems for some reason. 
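The removed inferTypesInFunction body above makes the design visible: inference and canonicalization are cyclically dependent, so the pass simply alternates them for a fixed number of rounds (see the TODO referencing #173) instead of iterating to a true fixed point. The general shape, as a plain C++ illustration (hypothetical driver, not DAPHNE API):

    #include <functional>
    #include <iostream>

    // Alternate two mutually dependent transformations for a bounded
    // number of rounds, stopping early if neither reports a change --
    // the same ping-pong that inferTypesInFunction runs with a fixed
    // round count instead of a convergence check.
    bool runToBoundedFixpoint(const std::function<bool()> &infer,
                              const std::function<bool()> &canonicalize, int maxRounds) {
        for (int i = 0; i < maxRounds; ++i) {
            bool changed = infer();
            changed = canonicalize() || changed;
            if (!changed)
                return true; // converged before the bound
        }
        return false; // bound hit; later rounds might still find facts
    }

    int main() {
        int facts = 0;
        auto infer = [&] { return facts < 3 ? (++facts, true) : false; };
        auto canon = [] { return false; };
        std::cout << runToBoundedFixpoint(infer, canon, 4) << "\n"; // 1 (converged)
    }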
- if(userConfig.use_ipa_const_propa && !isUntypedFunction(templateFunction)) { - // Insert compile-time constant scalar call operands into the function. - Block & specializedFuncBodyBlock = specializedFunc.getBody().front(); - builder.setInsertionPointToStart(&specializedFuncBodyBlock); - for(auto it : llvm::enumerate(operands)) { - auto i = it.index(); - Value v = it.value(); - if(Operation * co = CompilerUtils::constantOfAnyType(v)) { - // Clone the constant operation into the function body. - Operation * coNew = co->clone(); - builder.insert(coNew); - // Replace all uses of the corresponding block argument by the newly inserted constant. - specializedFuncBodyBlock.getArgument(i).replaceAllUsesWith(coNew->getResult(0)); - // TODO We could even remove the corresponding function argument. - insertedConst = true; - } + visited.insert(function); + // Specialize all functions called directly + function.walk([&](daphne::GenericCallOp callOp) { + auto calledFunction = functions[callOp.getCallee().str()]; + bool hasConstantInput = llvm::any_of( + callOp.getOperands(), [&](Value v) { return CompilerUtils::constantOfAnyType(v) != nullptr; }); + if (isFunctionTemplate(calledFunction) || hasConstantInput) { + func::FuncOp specializedFunc = createOrReuseSpecialization( + callOp.getOperandTypes(), callOp.getOperands(), calledFunction, callOp.getLoc()); + callOp.setCalleeAttr(specializedFunc.getSymNameAttr()); + if (fixResultTypes(callOp->getResults(), specializedFunc.getFunctionType())) { + inferTypesInFunction(function); } + specializeCallsInFunction(specializedFunc); + called.insert(specializedFunc); + } else { + specializeCallsInFunction(calledFunction); + called.insert(calledFunction); } - // Remember the newly specialized function for reuse only if we did not insert any constant - // call operands. - // TODO We could reuse it for other calls with the same constant (it's just more book-keeping effort). - if(!insertedConst) - specializedVersions.insert({templateFunction.getSymName().str(), specializedFunc}); - - return inferTypesInFunction(specializedFunc); - } - - /** - * @brief Try to reuse an existing specialization for the given template function - * @param operandTypes Operand types of the call operation - * @param operands Operands of the call operation or an empty list if the operands are not available - * @param templateFunction The template function called by the call operation - * @return either an existing and matching `FuncOp`, `nullptr` otherwise - */ - func::FuncOp tryReuseExistingSpecialization(TypeRange operandTypes, ValueRange operands, func::FuncOp templateFunction) { - if(userConfig.use_ipa_const_propa) { - // If any call operand is a compile-time constant scalar, we don't reuse an existing specialization, - // but create a new one while propagating the constant to the function body. - // TODO We could reuse a former specialization that uses the same constant. - for(Value v : operands) - if(CompilerUtils::constantOfAnyType(v)) - return nullptr; - } - - // Try to find a reusable function specialization based on types and data properties. 
- auto eqIt = specializedVersions.equal_range(templateFunction.getSymName().str());
- for(auto it = eqIt.first ; it != eqIt.second ; ++it) {
- auto specializedFunc = it->second;
-
- if(callTypesMatchFunctionTypes(specializedFunc.getFunctionType(), operandTypes)) {
- // reuse existing specialized function
- return specializedFunc;
+ });
+
+ // Specialize all functions called by MapOp
+ function.walk([&](daphne::MapOp mapOp) {
+ auto calledFunction = functions[mapOp.getFunc().str()];
+ if (isFunctionTemplate(calledFunction)) {
+ // Get the element type of the matrix the function should be
+ // mapped on
+ mlir::Type opTy = mapOp.getArg().getType();
+ auto inpMatrixTy = opTy.dyn_cast<daphne::MatrixType>();
+ func::FuncOp specializedFunc =
+ createOrReuseSpecialization(inpMatrixTy.getElementType(), {}, calledFunction, mapOp.getLoc());
+ mapOp.setFuncAttr(specializedFunc.getSymNameAttr());
+
+ // We only allow functions that return exactly one result for
+ // mapOp
+ if (specializedFunc.getFunctionType().getNumResults() != 1) {
+ throw ErrorHandler::compilerError(
+ mapOp.getOperation(), "SpecializeGenericFunctionsPass",
+ "map expects a function with exactly one return "
+ "value. The provided function returns " +
+ std::to_string(specializedFunc.getFunctionType().getNumResults()) + " values instead.");
 }
- }
- return nullptr;
- }
-
- /**
- * @brief Try to reuse an existing specializtion if one exists, else creates a new
- * specialization
- * @param operandTypes Operand types of the call operation
- * @param operands Operands of the call operation or an empty list if the operands are not available
- * @param calledFunction The function called by the call operation
- * @param callLoc The location of the call for which a function specialization shall be created or reused
- * @return A `FuncOp`for the specialization
- */
- func::FuncOp createOrReuseSpecialization(TypeRange operandTypes, ValueRange operands, func::FuncOp calledFunction, mlir::Location callLoc) {
- // check for existing specialization that matches
- func::FuncOp specializedFunc = tryReuseExistingSpecialization(operandTypes, operands, calledFunction);
- if(!specializedFunc) {
- // Create specialized function
- auto specializedTypes =
- getSpecializedFuncArgTypes(calledFunction.getFunctionType(), operandTypes, calledFunction.getSymName().str(), callLoc);
- specializedFunc = createSpecializedFunction(calledFunction, specializedTypes, operands);
- }
- if(logger->should_log(spdlog::level::debug)) {
- std::string s;
- llvm::raw_string_ostream stream(s);
- calledFunction->getLoc().print(stream);
- logger->debug("calledFunction\n\tname: {}\n\tlocation: {}", calledFunction.getSymName().str(), s);
- }
- templateFunctions.insert(calledFunction);
- return specializedFunc;
- }
+ // Get current mapOp result matrix type and fix it if needed.
+ // If we fixed something we rerun inference of the whole + // function + daphne::MatrixType resMatrixTy = mapOp.getType().dyn_cast(); + mlir::Type funcResTy = specializedFunc.getFunctionType().getResult(0); + + // The matrix that results from the mapOp has the same dimension + // as the input matrix and the element-type returned by the + // specialized function + if (resMatrixTy.getNumCols() != inpMatrixTy.getNumCols() || + resMatrixTy.getNumRows() != inpMatrixTy.getNumRows() || resMatrixTy.getElementType() != funcResTy) { + mapOp.getResult().setType(inpMatrixTy.withElementType(funcResTy)); + inferTypesInFunction(function); + } - /** - * @brief Recursively specializes all functions within a `FuncOp` based on calls to the functions - * @param function The `FuncOp` to scan for function specializations - */ - void specializeCallsInFunction(func::FuncOp function) { - if(visited.count(function)) { - return; + specializeCallsInFunction(specializedFunc); + called.insert(specializedFunc); + } else { + specializeCallsInFunction(calledFunction); + called.insert(calledFunction); } - visited.insert(function); - // Specialize all functions called directly - function.walk([&](daphne::GenericCallOp callOp) { - auto calledFunction = functions[callOp.getCallee().str()]; - bool hasConstantInput = llvm::any_of( - callOp.getOperands(), - [&](Value v) { - return CompilerUtils::constantOfAnyType(v) != nullptr; - } - ); - if(isFunctionTemplate(calledFunction) || hasConstantInput) { - func::FuncOp specializedFunc = createOrReuseSpecialization(callOp.getOperandTypes(), callOp.getOperands(), calledFunction, callOp.getLoc()); - callOp.setCalleeAttr(specializedFunc.getSymNameAttr()); - if(fixResultTypes(callOp->getResults(), specializedFunc.getFunctionType())) { - inferTypesInFunction(function); - } - specializeCallsInFunction(specializedFunc); - called.insert(specializedFunc); - } - else { - specializeCallsInFunction(calledFunction); - called.insert(calledFunction); - } - }); - - // Specialize all functions called by MapOp - function.walk([&](daphne::MapOp mapOp) { - auto calledFunction = functions[mapOp.getFunc().str()]; - if(isFunctionTemplate(calledFunction)) { - // Get the element type of the matrix the function should be mapped on - mlir::Type opTy = mapOp.getArg().getType(); - auto inpMatrixTy = opTy.dyn_cast(); - func::FuncOp specializedFunc = createOrReuseSpecialization(inpMatrixTy.getElementType(), {}, calledFunction, mapOp.getLoc()); - mapOp.setFuncAttr(specializedFunc.getSymNameAttr()); - - // We only allow functions that return exactly one result for mapOp - if (specializedFunc.getFunctionType().getNumResults() != 1) { - throw ErrorHandler::compilerError( - mapOp.getOperation(), - "SpecializeGenericFunctionsPass", - "map expects a function with exactly one return " - "value. The provided function returns" + - std::to_string(specializedFunc.getFunctionType() - .getNumResults()) + - "values instead."); - } - - // Get current mapOp result matrix type and fix it if needed. 
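The result-type rule applied here is worth stating on its own: the map result keeps the input matrix's dimensions and takes its element type from the specialized function's single result. As a toy computation (stand-in types, purely illustrative):

    #include <cassert>
    #include <string>

    struct MatTy {
        long rows, cols; // -1 would mean "unknown" in the real IR
        std::string elemTy;
    };

    // Same dimensions as the input, element type from the mapped
    // function's result -- the condition in the pass only rewrites the
    // type (and reruns inference) when one of these three differs.
    MatTy mapResultType(const MatTy &input, const std::string &funcResTy) {
        return MatTy{input.rows, input.cols, funcResTy};
    }

    int main() {
        MatTy out = mapResultType(MatTy{10, 5, "f64"}, "si64");
        assert(out.rows == 10 && out.cols == 5 && out.elemTy == "si64");
    }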
- // If we fixed something we rerun inference of the whole function - daphne::MatrixType resMatrixTy = mapOp.getType().dyn_cast(); - mlir::Type funcResTy = specializedFunc.getFunctionType().getResult(0); - - // The matrix that results from the mapOp has the same dimension as the input - // matrix and the element-type returned by the specialized function - if(resMatrixTy.getNumCols() != inpMatrixTy.getNumCols() || - resMatrixTy.getNumRows() != inpMatrixTy.getNumRows() || - resMatrixTy.getElementType() != funcResTy) { - mapOp.getResult().setType(inpMatrixTy.withElementType(funcResTy)); - inferTypesInFunction(function); - } - - specializeCallsInFunction(specializedFunc); - called.insert(specializedFunc); - } - else { - specializeCallsInFunction(calledFunction); - called.insert(calledFunction); - } - }); - } + }); + } - public: - void runOnOperation() final; + public: + void runOnOperation() final; StringRef getArgument() const final { return "specialize-generic-funcs"; } StringRef getDescription() const final { return "TODO"; } - }; -} +}; +} // namespace /** - * @brief Generate and call specialized functions from template definitions and remove templates. + * @brief Generate and call specialized functions from template definitions and + * remove templates. * * We start entry functions (like `main` or `dist`) and then proceed as follows: * - * 1. Infer types (types up to the first `GenericCallOp` will be inferred for sure) - * 2. If the function called by `GenericCallOp` is untyped (input types are unknown), we clone it and set the input types - * to the types used in the call. For this specialized function we then do the same steps starting at 1. - * 3. With the (possibly cloned) specialized function we now know the outputs. Starting here we infer up to the next - * `GenericCallOp` and go back to step 2. + * 1. Infer types (types up to the first `GenericCallOp` will be inferred for + * sure) + * 2. If the function called by `GenericCallOp` is untyped (input types are + * unknown), we clone it and set the input types to the types used in the call. + * For this specialized function we then do the same steps starting at 1. + * 3. With the (possibly cloned) specialized function we now know the outputs. + * Starting here we infer up to the next `GenericCallOp` and go back to step 2. * 4. When all `GenericCallOp`s are specialized we are finished * - * Finally we delete all the template functions such that the MLIR code can be verified for correct input and output types. + * Finally we delete all the template functions such that the MLIR code can be + * verified for correct input and output types. */ void SpecializeGenericFunctionsPass::runOnOperation() { auto module = getOperation(); - module.walk([&](func::FuncOp funcOp) { - functions.insert({funcOp.getSymName().str(), funcOp}); - }); + module.walk([&](func::FuncOp funcOp) { functions.insert({funcOp.getSymName().str(), funcOp}); }); - // `entryFunctions` will hold entry functions like `main`, but also `dist` (for distributed computation) - // we could also directly specify the names `main`, `dist` etc. (if we add more `entry` functions), or just set - // an attribute flag for those functions. + // `entryFunctions` will hold entry functions like `main`, but also `dist` + // (for distributed computation) we could also directly specify the names + // `main`, `dist` etc. (if we add more `entry` functions), or just set an + // attribute flag for those functions. 
 std::vector<func::FuncOp> entryFunctions;
- for(const auto &entry : functions) {
+ for (const auto &entry : functions) {
 entryFunctions.push_back(entry.second);
 }
- for(const auto &function : entryFunctions) {
- if(isFunctionTemplate(function) || visited.count(function) || templateFunctions.count(function))
+ for (const auto &function : entryFunctions) {
+ if (isFunctionTemplate(function) || visited.count(function) || templateFunctions.count(function))
 continue;
 try {
 inferTypesInFunction(function);
- } catch (std::runtime_error& e) {
+ } catch (std::runtime_error &e) {
 throw ErrorHandler::rethrowError("SpecializeGenericFunctionsPass", e.what());
 }
 specializeCallsInFunction(function);
 }

 // Delete non-called functions.
- for(auto f : functions) {
+ for (auto f : functions) {
 // Never remove the main or dist function.
- if(f.first == "main" or f.first == "dist")
+ if (f.first == "main" or f.first == "dist")
 continue;
 // Remove a function that was present before creating specializations,
 // if it is never called.
- if(!called.count(f.second) || templateFunctions.count(f.second))
+ if (!called.count(f.second) || templateFunctions.count(f.second))
 f.second.erase();
 }
 }

-std::unique_ptr<Pass> daphne::createSpecializeGenericFunctionsPass(const DaphneUserConfig& cfg) {
+std::unique_ptr<Pass> daphne::createSpecializeGenericFunctionsPass(const DaphneUserConfig &cfg) {
 return std::make_unique<SpecializeGenericFunctionsPass>(cfg);
 }
diff --git a/src/compiler/lowering/VectorizeComputationsPass.cpp b/src/compiler/lowering/VectorizeComputationsPass.cpp
index a891f3543..985c6442e 100644
--- a/src/compiler/lowering/VectorizeComputationsPass.cpp
+++ b/src/compiler/lowering/VectorizeComputationsPass.cpp
@@ -14,190 +14,196 @@
 * limitations under the License.
 */

- #include "compiler/utils/CompilerUtils.h"
-#include
 #include "ir/daphneir/Daphne.h"
 #include "ir/daphneir/Passes.h"
+#include
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include
 #include
 #include
-#include

 using namespace mlir;

-namespace
-{
- /**
- * @brief Recursive function checking if the given value is transitively dependant on the operation `op`.
- * @param value The value to check
- * @param op The operation to check
- * @return true if there is a dependency, false otherwise
- */
- bool valueDependsOnResultOf(Value value, Operation *op) {
- if (auto defOp = value.getDefiningOp()) {
- if (defOp == op)
- return true;
 #if 1
- // TODO This crashes if defOp and op are not in the same block.
- // At the same time, it does not seem to be strictly required.
-// if (defOp->isBeforeInBlock(op))
- // Nevertheless, this modified line seems to be a good soft-filter;
- // without that, the vectorization pass may take very long on
- // programs with 100s of operations.
- if (defOp->getBlock() == op->getBlock() && defOp->isBeforeInBlock(op))
- // can't have results of `op` as inputs, as it is defined before
- return false;
+namespace {
+/**
+ * @brief Recursive function checking if the given value is transitively
+ * dependent on the operation `op`.
+ * @param value The value to check
+ * @param op The operation to check
+ * @return true if there is a dependency, false otherwise
+ */
+bool valueDependsOnResultOf(Value value, Operation *op) {
+ if (auto defOp = value.getDefiningOp()) {
+ if (defOp == op)
+ return true;
 #if 1
+ // TODO This crashes if defOp and op are not in the same block.
+ // At the same time, it does not seem to be strictly required.
+ // if (defOp->isBeforeInBlock(op)) + // Nevertheless, this modified line seems to be a good soft-filter; + // without that, the vectorization pass may take very long on + // programs with 100s of operations. + if (defOp->getBlock() == op->getBlock() && defOp->isBeforeInBlock(op)) + // can't have results of `op` as inputs, as it is defined before + return false; #endif - for (auto operand : defOp->getOperands()) { - if (valueDependsOnResultOf(operand, op)) - return true; - } + for (auto operand : defOp->getOperands()) { + if (valueDependsOnResultOf(operand, op)) + return true; } - return false; } + return false; +} - /** - * @brief Check if the vectorizable operation can directly be fused into the pipeline, without requiring any other - * operation to be fused first. - * @param opBefore The vectorizable operation to check - * @param pipeline The pipeline - * @return true if it can be directly fused, false otherwise - */ - bool isDirectlyFusible(daphne::Vectorizable opBefore, const std::vector& pipeline) { - for (auto pipeOp : pipeline) { - for (auto operand : pipeOp->getOperands()) { - if (std::find(pipeline.begin(), pipeline.end(), operand.getDefiningOp()) != pipeline.end()) { - // transitive dependencies inside the pipeline are of course fine. - continue; - } - if (operand.getDefiningOp() != opBefore && valueDependsOnResultOf(operand, opBefore)) { - return false; - } +/** + * @brief Check if the vectorizable operation can directly be fused into the + * pipeline, without requiring any other operation to be fused first. + * @param opBefore The vectorizable operation to check + * @param pipeline The pipeline + * @return true if it can be directly fused, false otherwise + */ +bool isDirectlyFusible(daphne::Vectorizable opBefore, const std::vector &pipeline) { + for (auto pipeOp : pipeline) { + for (auto operand : pipeOp->getOperands()) { + if (std::find(pipeline.begin(), pipeline.end(), operand.getDefiningOp()) != pipeline.end()) { + // transitive dependencies inside the pipeline are of course + // fine. + continue; + } + if (operand.getDefiningOp() != opBefore && valueDependsOnResultOf(operand, opBefore)) { + return false; } } - return true; } + return true; +} - /** - * @brief Greedily fuses the operation into the pipeline if possible. 
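valueDependsOnResultOf is a depth-first walk over operand producers, with the same-block "defined before" test acting only as a pruning filter. Ignoring that filter, the core reachability check looks like this toy sketch (plain structs, not MLIR):

    #include <iostream>
    #include <vector>

    // Toy operation: only records which ops produce its operands.
    struct Op {
        std::vector<const Op *> operandDefs;
    };

    // Does `def` (the producer of some value) transitively depend on
    // `op`? Mirrors the recursion over defOp->getOperands() above.
    bool dependsOn(const Op *def, const Op *op) {
        if (!def)
            return false; // block argument: no defining op
        if (def == op)
            return true;
        for (const Op *d : def->operandDefs)
            if (dependsOn(d, op))
                return true;
        return false;
    }

    int main() {
        Op a;
        Op b{{&a}};
        Op c{{&b}};
        std::cout << dependsOn(&c, &a) << " " << dependsOn(&a, &c) << "\n"; // 1 0
    }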
- * @param operationToPipelineIx A map of operations to their index in the pipelines collection - * @param pipelines The collection of pipelines - * @param currentPipelineIx The index of the current pipeline into which we want to possibly fuse the operation - * @param operationToCheck The operation we possibly want to fuse into the current pipeline - */ - void greedyPipelineFusion(std::map &operationToPipelineIx, - std::vector> &pipelines, - size_t currentPipelineIx, daphne::Vectorizable operationToCheck) { - auto ¤tPipeline = pipelines[currentPipelineIx]; - auto existingPipelineIt = operationToPipelineIx.find(operationToCheck); - if(existingPipelineIt != operationToPipelineIx.end()) { - // existing pipeline is sure to be after the current pipeline (due to reverse iteration order) - auto existingPipelineIx = existingPipelineIt->second; - auto &existingPipeline = pipelines[existingPipelineIx]; - for (auto op : currentPipeline) { - if (!isDirectlyFusible(op, existingPipeline)) { - continue; - } - } - // append existing to current - currentPipeline.insert(currentPipeline.end(), existingPipeline.begin(), existingPipeline.end()); - for (auto vectorizable : existingPipeline) { - operationToPipelineIx[vectorizable] = currentPipelineIx; +/** + * @brief Greedily fuses the operation into the pipeline if possible. + * @param operationToPipelineIx A map of operations to their index in the + * pipelines collection + * @param pipelines The collection of pipelines + * @param currentPipelineIx The index of the current pipeline into which we want + * to possibly fuse the operation + * @param operationToCheck The operation we possibly want to fuse into the + * current pipeline + */ +void greedyPipelineFusion(std::map &operationToPipelineIx, + std::vector> &pipelines, size_t currentPipelineIx, + daphne::Vectorizable operationToCheck) { + auto ¤tPipeline = pipelines[currentPipelineIx]; + auto existingPipelineIt = operationToPipelineIx.find(operationToCheck); + if (existingPipelineIt != operationToPipelineIx.end()) { + // existing pipeline is sure to be after the current pipeline (due to + // reverse iteration order) + auto existingPipelineIx = existingPipelineIt->second; + auto &existingPipeline = pipelines[existingPipelineIx]; + for (auto op : currentPipeline) { + if (!isDirectlyFusible(op, existingPipeline)) { + continue; } - // just make it empty, it will be skipped later. Ixs changes and reshuffling is therefore not necessary. - existingPipeline.clear(); } - else if(isDirectlyFusible(operationToCheck, currentPipeline)) { - currentPipeline.push_back(operationToCheck); - operationToPipelineIx[operationToCheck] = currentPipelineIx; + // append existing to current + currentPipeline.insert(currentPipeline.end(), existingPipeline.begin(), existingPipeline.end()); + for (auto vectorizable : existingPipeline) { + operationToPipelineIx[vectorizable] = currentPipelineIx; } + // just make it empty, it will be skipped later. Ixs changes and + // reshuffling is therefore not necessary. + existingPipeline.clear(); + } else if (isDirectlyFusible(operationToCheck, currentPipeline)) { + currentPipeline.push_back(operationToCheck); + operationToPipelineIx[operationToCheck] = currentPipelineIx; } +} - /** - * @brief Moves operation which are between the operations, which should be fused into a single pipeline, before - * or after the position where the pipeline will be placed. 
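The merge branch of greedyPipelineFusion does three things: append the other pipeline's ops to the current one, repoint their index-map entries, and clear the old pipeline so later code skips it without renumbering. That bookkeeping, reduced to integers (toy types; the per-op fusibility check is elided):

    #include <cstddef>
    #include <iostream>
    #include <map>
    #include <vector>

    using Pipelines = std::vector<std::vector<int>>;

    // Append pipeline `src` to pipeline `dst`, repoint all of src's ops
    // to dst, and leave src empty -- emptiness is the "skip me" marker,
    // so no indices need reshuffling afterwards.
    void mergePipelines(Pipelines &ps, std::map<int, std::size_t> &opToPipe, std::size_t dst,
                        std::size_t src) {
        ps[dst].insert(ps[dst].end(), ps[src].begin(), ps[src].end());
        for (int op : ps[src])
            opToPipe[op] = dst;
        ps[src].clear();
    }

    int main() {
        Pipelines ps{{1, 2}, {3}};
        std::map<int, std::size_t> idx{{1, 0}, {2, 0}, {3, 1}};
        mergePipelines(ps, idx, 0, 1);
        std::cout << ps[0].size() << " " << ps[1].size() << " " << idx[3] << "\n"; // 3 0 0
    }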
- * @param pipelinePosition The position where the pipeline will be
- * @param pipeline The pipeline for which this function should be executed
- */
- void movePipelineInterleavedOperations(Block::iterator pipelinePosition, const std::vector<daphne::Vectorizable> &pipeline) {
- // first operation in pipeline vector is last in IR, and the last is the first
- auto startPos = pipeline.back()->getIterator();
- auto endPos = pipeline.front()->getIterator();
- auto currSkip = pipeline.rbegin();
- std::vector<Operation*> moveBeforeOps;
- std::vector<Operation*> moveAfterOps;
- for(auto it = startPos; it != endPos; ++it) {
- if (it == (*currSkip)->getIterator()) {
- ++currSkip;
- continue;
- }
+/**
+ * @brief Moves operations that lie between the operations which should be
+ * fused into a single pipeline before or after the position where the pipeline
+ * will be placed.
+ * @param pipelinePosition The position where the pipeline will be
+ * @param pipeline The pipeline for which this function should be executed
+ */
+void movePipelineInterleavedOperations(Block::iterator pipelinePosition,
+ const std::vector<daphne::Vectorizable> &pipeline) {
+ // first operation in pipeline vector is last in IR, and the last is the
+ // first
+ auto startPos = pipeline.back()->getIterator();
+ auto endPos = pipeline.front()->getIterator();
+ auto currSkip = pipeline.rbegin();
+ std::vector<Operation *> moveBeforeOps;
+ std::vector<Operation *> moveAfterOps;
+ for (auto it = startPos; it != endPos; ++it) {
+ if (it == (*currSkip)->getIterator()) {
+ ++currSkip;
+ continue;
+ }

- bool dependsOnPipeline = false;
- auto pipelineOpsBeforeIt = currSkip;
- while (--pipelineOpsBeforeIt != pipeline.rbegin()) {
- for (auto operand : it->getOperands()) {
- if(valueDependsOnResultOf(operand, *pipelineOpsBeforeIt)) {
- dependsOnPipeline = true;
- break;
- }
- }
- if (dependsOnPipeline) {
- break;
- }
- }
- // check first pipeline op
+ bool dependsOnPipeline = false;
+ auto pipelineOpsBeforeIt = currSkip;
+ while (--pipelineOpsBeforeIt != pipeline.rbegin()) {
 for (auto operand : it->getOperands()) {
- if(valueDependsOnResultOf(operand, *pipelineOpsBeforeIt)) {
+ if (valueDependsOnResultOf(operand, *pipelineOpsBeforeIt)) {
 dependsOnPipeline = true;
 break;
 }
 }
 if (dependsOnPipeline) {
- moveAfterOps.push_back(&(*it));
- }
- else {
- moveBeforeOps.push_back(&(*it));
+ break;
 }
 }
-
- for(auto moveBeforeOp: moveBeforeOps) {
- moveBeforeOp->moveBefore(pipelinePosition->getBlock(), pipelinePosition);
+ // check first pipeline op
+ for (auto operand : it->getOperands()) {
+ if (valueDependsOnResultOf(operand, *pipelineOpsBeforeIt)) {
+ dependsOnPipeline = true;
+ break;
+ }
 }
- for(auto moveAfterOp: moveAfterOps) {
- moveAfterOp->moveAfter(pipelinePosition->getBlock(), pipelinePosition);
- pipelinePosition = moveAfterOp->getIterator();
+ if (dependsOnPipeline) {
+ moveAfterOps.push_back(&(*it));
+ } else {
+ moveBeforeOps.push_back(&(*it));
 }
 }

- struct VectorizeComputationsPass : public PassWrapper<VectorizeComputationsPass, OperationPass<func::FuncOp>> {
- void runOnOperation() final;
- };
+ for (auto moveBeforeOp : moveBeforeOps) {
+ moveBeforeOp->moveBefore(pipelinePosition->getBlock(), pipelinePosition);
+ }
+ for (auto moveAfterOp : moveAfterOps) {
+ moveAfterOp->moveAfter(pipelinePosition->getBlock(), pipelinePosition);
+ pipelinePosition = moveAfterOp->getIterator();
+ }
 }

-void VectorizeComputationsPass::runOnOperation()
-{
+struct VectorizeComputationsPass : public PassWrapper<VectorizeComputationsPass, OperationPass<func::FuncOp>> {
+ void runOnOperation() final;
+};
+} // namespace
+
+void VectorizeComputationsPass::runOnOperation() {
 auto func = getOperation();
- // TODO: fuse pipelines that have the matching inputs,
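Abstractly, the loop above partitions every op interleaved with the pipeline into two sets: ops that depend on a pipeline result must move after the fused pipeline op, everything else may move before it. A stand-alone illustration with an assumed dependency predicate:

    #include <functional>
    #include <iostream>
    #include <vector>

    // Split the interleaved ops: dependents of the pipeline go after
    // the fused op, independent ops go before it (order preserved).
    void partitionInterleaved(const std::vector<int> &interleaved,
                              const std::function<bool(int)> &dependsOnPipeline,
                              std::vector<int> &moveBefore, std::vector<int> &moveAfter) {
        for (int op : interleaved)
            (dependsOnPipeline(op) ? moveAfter : moveBefore).push_back(op);
    }

    int main() {
        std::vector<int> before, after;
        partitionInterleaved({4, 5, 6}, [](int op) { return op == 5; }, before, after);
        std::cout << before.size() << " " << after.size() << "\n"; // 2 1
    }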
even if no output of the one pipeline is used by the other. - // This requires multi-returns in way more cases, which is not implemented yet. + // TODO: fuse pipelines that have the matching inputs, even if no output of + // the one pipeline is used by the other. + // This requires multi-returns in way more cases, which is not implemented + // yet. // Find vectorizable operations and their inputs of vectorizable operations std::vector vectOps; - func->walk([&](daphne::Vectorizable op) - { - if(CompilerUtils::isMatrixComputation(op)) - vectOps.emplace_back(op); + func->walk([&](daphne::Vectorizable op) { + if (CompilerUtils::isMatrixComputation(op)) + vectOps.emplace_back(op); }); std::vector vectorizables(vectOps.begin(), vectOps.end()); std::multimap possibleMerges; - for(auto v : vectorizables) { - for(auto e : llvm::zip(v->getOperands(), v.getVectorSplits())) { + for (auto v : vectorizables) { + for (auto e : llvm::zip(v->getOperands(), v.getVectorSplits())) { auto operand = std::get<0>(e); auto defOp = operand.getDefiningOp(); - if(defOp && v->getBlock() == defOp->getBlock() && CompilerUtils::isMatrixComputation(defOp)) { + if (defOp && v->getBlock() == defOp->getBlock() && CompilerUtils::isMatrixComputation(defOp)) { // defOp is not a candidate for fusion with v, if the // result/operand along which we would fuse is used within a // nested block (e.g., control structure) between defOp and v. @@ -207,56 +213,53 @@ void VectorizeComputationsPass::runOnOperation() // when it would be safe (also taking NoSideEffect into // account). bool qualified = true; - for(OpOperand & use : operand.getUses()) { - Operation * user = use.getOwner(); - if(user->getBlock() != v->getBlock()) { + for (OpOperand &use : operand.getUses()) { + Operation *user = use.getOwner(); + if (user->getBlock() != v->getBlock()) { // user must be in a child block of the block in which // v resides, because we have already checked that v // and defOp are in the same block. 
- while(user->getBlock() != v->getBlock()) + while (user->getBlock() != v->getBlock()) user = user->getParentOp(); - if(user->isBeforeInBlock(v)) { + if (user->isBeforeInBlock(v)) { qualified = false; break; } } } - if(qualified){ + if (qualified) { auto split = std::get<1>(e); // find the corresponding `OpResult` to figure out combine auto opResult = *llvm::find(defOp->getResults(), operand); auto combine = defOp.getVectorCombines()[opResult.getResultNumber()]; - if(split == daphne::VectorSplit::ROWS) { - if(combine == daphne::VectorCombine::ROWS) + if (split == daphne::VectorSplit::ROWS) { + if (combine == daphne::VectorCombine::ROWS) possibleMerges.insert({v, defOp}); - } - else if (split == daphne::VectorSplit::NONE) { + } else if (split == daphne::VectorSplit::NONE) { // can't be merged - } - else { - throw ErrorHandler::compilerError( - v, "VectorizeComputationsPass", - "VectorSplit case `" + stringifyEnum(split).str() + - "` not handled"); + } else { + throw ErrorHandler::compilerError(v, "VectorizeComputationsPass", + "VectorSplit case `" + stringifyEnum(split).str() + + "` not handled"); } } } } } - // Collect vectorizable operations that can be computed together in pipelines + // Collect vectorizable operations that can be computed together in + // pipelines std::map operationToPipelineIx; std::vector> pipelines; - for(auto vIt = vectorizables.rbegin(); vIt != vectorizables.rend(); ++vIt) { + for (auto vIt = vectorizables.rbegin(); vIt != vectorizables.rend(); ++vIt) { auto v = *vIt; size_t pipelineIx; auto pipelineIt = operationToPipelineIx.find(v); - if(pipelineIt != operationToPipelineIx.end()) { + if (pipelineIt != operationToPipelineIx.end()) { pipelineIx = pipelineIt->second; - } - else { + } else { pipelineIx = pipelines.size(); std::vector pipeline; pipeline.push_back(v); @@ -265,17 +268,18 @@ void VectorizeComputationsPass::runOnOperation() // iterate all operands that could be combined into the pipeline auto itRange = possibleMerges.equal_range(v); - for(auto it = itRange.first; it != itRange.second; ++it) { + for (auto it = itRange.first; it != itRange.second; ++it) { auto operandVectorizable = it->second; - // TODO: this fuses greedily, the first pipeline we can fuse this operation into, we do. improve + // TODO: this fuses greedily, the first pipeline we can fuse this + // operation into, we do. 
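The qualification test above climbs from a user inside a nested block up through parent operations until it reaches an ancestor in v's block, and only then compares positions. The climb itself, on a toy parent/block structure (illustrative only):

    #include <iostream>

    struct Node {
        const Node *parent = nullptr;
        int block = 0;    // id of the block this node lives in
        int position = 0; // order within that block
    };

    // Mirror of: while (user->getBlock() != v->getBlock())
    //                user = user->getParentOp();
    const Node *hoistToBlockOf(const Node *user, const Node *v) {
        while (user && user->block != v->block)
            user = user->parent;
        return user;
    }

    int main() {
        Node v{nullptr, 1, 5};
        Node enclosing{nullptr, 1, 2}; // e.g. a control-flow op before v
        Node nestedUser{&enclosing, 2, 0};
        const Node *anc = hoistToBlockOf(&nestedUser, &v);
        std::cout << (anc && anc->position < v.position) << "\n"; // 1: use precedes v
    }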
improve greedyPipelineFusion(operationToPipelineIx, pipelines, pipelineIx, operandVectorizable); } } OpBuilder builder(func); // Create the `VectorizedPipelineOp`s - for(auto pipeline : pipelines) { - if(pipeline.empty()) { + for (auto pipeline : pipelines) { + if (pipeline.empty()) { continue; } auto valueIsPartOfPipeline = [&](Value operand) { @@ -291,110 +295,106 @@ void VectorizeComputationsPass::runOnOperation() // first op in pipeline is last in IR builder.setInsertionPoint(pipeline.front()); - // move all operations, between the operations that will be part of the pipeline, before or after the - // completed pipeline + // move all operations, between the operations that will be part of the + // pipeline, before or after the completed pipeline movePipelineInterleavedOperations(builder.getInsertionPoint(), pipeline); - for(auto vIt = pipeline.rbegin(); vIt != pipeline.rend(); ++vIt) { + for (auto vIt = pipeline.rbegin(); vIt != pipeline.rend(); ++vIt) { auto v = *vIt; auto vSplits = v.getVectorSplits(); auto vCombines = v.getVectorCombines(); - // TODO: although we do create enum attributes, it might make sense/make it easier to + // TODO: although we do create enum attributes, it might make + // sense/make it easier to // just directly use an I64ArrayAttribute - for(auto i = 0u; i < v->getNumOperands(); ++i) { + for (auto i = 0u; i < v->getNumOperands(); ++i) { auto operand = v->getOperand(i); - if(!valueIsPartOfPipeline(operand)) { + if (!valueIsPartOfPipeline(operand)) { vSplitAttrs.push_back(daphne::VectorSplitAttr::get(&getContext(), vSplits[i])); operands.push_back(operand); } } - for(auto vCombine : vCombines) { + for (auto vCombine : vCombines) { vCombineAttrs.push_back(daphne::VectorCombineAttr::get(&getContext(), vCombine)); } locations.push_back(v->getLoc()); - for(auto result: v->getResults()) { + for (auto result : v->getResults()) { results.push_back(result); } - for(auto outSize: v.createOpsOutputSizes(builder)) { + for (auto outSize : v.createOpsOutputSizes(builder)) { outRows.push_back(outSize.first); outCols.push_back(outSize.second); } } std::vector locs; locs.reserve(pipeline.size()); - for(auto op: pipeline) { + for (auto op : pipeline) { locs.push_back(op->getLoc()); } auto loc = builder.getFusedLoc(locs); - auto pipelineOp = builder.create(loc, - ValueRange(results).getTypes(), - operands, - outRows, - outCols, - builder.getArrayAttr(vSplitAttrs), - builder.getArrayAttr(vCombineAttrs), - nullptr); + auto pipelineOp = builder.create( + loc, ValueRange(results).getTypes(), operands, outRows, outCols, builder.getArrayAttr(vSplitAttrs), + builder.getArrayAttr(vCombineAttrs), nullptr); Block *bodyBlock = builder.createBlock(&pipelineOp.getBody()); - for(size_t i = 0u; i < operands.size(); ++i) { + for (size_t i = 0u; i < operands.size(); ++i) { auto argTy = operands[i].getType(); switch (vSplitAttrs[i].cast().getValue()) { - case daphne::VectorSplit::ROWS: { - auto matTy = argTy.cast(); - // only remove row information - argTy = matTy.withShape(-1, matTy.getNumCols()); - break; - } - case daphne::VectorSplit::NONE: - // keep any size information - break; + case daphne::VectorSplit::ROWS: { + auto matTy = argTy.cast(); + // only remove row information + argTy = matTy.withShape(-1, matTy.getNumCols()); + break; + } + case daphne::VectorSplit::NONE: + // keep any size information + break; } bodyBlock->addArgument(argTy, builder.getUnknownLoc()); } auto argsIx = 0u; auto resultsIx = 0u; - for(auto vIt = pipeline.rbegin(); vIt != pipeline.rend(); ++vIt) { + for (auto vIt 
= pipeline.rbegin(); vIt != pipeline.rend(); ++vIt) { auto v = *vIt; auto numOperands = v->getNumOperands(); auto numResults = v->getNumResults(); v->moveBefore(bodyBlock, bodyBlock->end()); - for(auto i = 0u; i < numOperands; ++i) { - if(!valueIsPartOfPipeline(v->getOperand(i))) { + for (auto i = 0u; i < numOperands; ++i) { + if (!valueIsPartOfPipeline(v->getOperand(i))) { v->setOperand(i, bodyBlock->getArgument(argsIx++)); } } auto pipelineReplaceResults = pipelineOp->getResults().drop_front(resultsIx).take_front(numResults); resultsIx += numResults; - for(auto z: llvm::zip(v->getResults(), pipelineReplaceResults)) { + for (auto z : llvm::zip(v->getResults(), pipelineReplaceResults)) { auto old = std::get<0>(z); auto replacement = std::get<1>(z); // TODO: switch to type based size inference instead // FIXME: if output is dynamic sized, we can't do this // replace `NumRowOp` and `NumColOp`s for output size inference - for(auto& use: old.getUses()) { - auto* op = use.getOwner(); - if(auto nrowOp = llvm::dyn_cast(op)) { + for (auto &use : old.getUses()) { + auto *op = use.getOwner(); + if (auto nrowOp = llvm::dyn_cast(op)) { nrowOp.replaceAllUsesWith(pipelineOp.getOutRows()[replacement.getResultNumber()]); nrowOp.erase(); } - if(auto ncolOp = llvm::dyn_cast(op)) { + if (auto ncolOp = llvm::dyn_cast(op)) { ncolOp.replaceAllUsesWith(pipelineOp.getOutCols()[replacement.getResultNumber()]); ncolOp.erase(); } } // Replace only if not used by pipeline op - old.replaceUsesWithIf(replacement, [&](OpOperand& opOperand) { + old.replaceUsesWithIf(replacement, [&](OpOperand &opOperand) { return llvm::count(pipeline, opOperand.getOwner()) == 0; }); } } - bodyBlock->walk([](Operation* op) { - for(auto resVal: op->getResults()) { - if(auto ty = resVal.getType().dyn_cast()) { + bodyBlock->walk([](Operation *op) { + for (auto resVal : op->getResults()) { + if (auto ty = resVal.getType().dyn_cast()) { resVal.setType(ty.withShape(-1, -1)); } } diff --git a/src/compiler/lowering/WhileLoopInvariantCodeMotionPass.cpp b/src/compiler/lowering/WhileLoopInvariantCodeMotionPass.cpp deleted file mode 100644 index 8e933155e..000000000 --- a/src/compiler/lowering/WhileLoopInvariantCodeMotionPass.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright 2021 The DAPHNE Consortium - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ir/daphneir/Passes.h" - -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/SCF/IR/SCF.h" - -#include - -using namespace mlir; - -/** - * @brief This is a very limited variant of loop invariant code motion (LICM), - * tailored just to WhileOp. - * - * We need this because MLIR does not seem to support LICM for while loops. - * Nevertheless, we should clarify this (see #175). - * - * This pass is strongly inspired by MLIR's LoopInvariantCodeMotion.cpp, but - * significantly simplified. 
- */ -struct WhileLoopInvariantCodeMotionPass -: public PassWrapper > { - void runOnOperation() final; - - StringRef getArgument() const final { return "while-loop-invariant-code-motion"; } - StringRef getDescription() const final { return "TODO"; } -}; - -void WhileLoopInvariantCodeMotionPass::runOnOperation() { - getOperation()->walk([&](scf::WhileOp whileOp) { - Region & loopBody = whileOp.getAfter(); - - SmallPtrSet willBeMovedSet; - SmallVector opsToMove; - - auto isDefinedOutsideOfBody = [&](Value value) { - auto definingOp = value.getDefiningOp(); - return (definingOp && !!willBeMovedSet.count(definingOp)) || - !loopBody.isAncestor(value.getParentRegion()); - }; - - for(auto & block : loopBody) - for(auto & op : block.without_terminator()) { - auto memInterface = dyn_cast(op); - if( - llvm::all_of(op.getOperands(), isDefinedOutsideOfBody) && - op.hasTrait() && // such that we don't need to recurse - memInterface && memInterface.hasNoEffect() - ) { - opsToMove.push_back(&op); - willBeMovedSet.insert(&op); - } - } - - for(auto op : opsToMove) - op->moveBefore(whileOp); - }); -} - -std::unique_ptr daphne::createWhileLoopInvariantCodeMotionPass() { - return std::make_unique(); -} diff --git a/src/compiler/utils/CompilerUtils.cpp b/src/compiler/utils/CompilerUtils.cpp index 85c55a814..da0931ab5 100644 --- a/src/compiler/utils/CompilerUtils.cpp +++ b/src/compiler/utils/CompilerUtils.cpp @@ -24,15 +24,16 @@ // Specializations of isConstantHelper for string types // ************************************************************************************************** -template<> -std::pair CompilerUtils::isConstantHelper(mlir::Value v, const std::function& func) { - if(auto co = v.getDefiningOp()) { - if(auto attr = co.getValue().dyn_cast()) { +template <> +std::pair CompilerUtils::isConstantHelper( + mlir::Value v, const std::function &func) { + if (auto co = v.getDefiningOp()) { + if (auto attr = co.getValue().dyn_cast()) { return std::make_pair(true, func(attr)); } } - if(auto co = v.getDefiningOp()) { - if(auto attr = co.getValue().dyn_cast()) { + if (auto co = v.getDefiningOp()) { + if (auto attr = co.getValue().dyn_cast()) { return std::make_pair(true, func(attr)); } } @@ -43,166 +44,120 @@ std::pair CompilerUtils::isConstantHelper -std::pair CompilerUtils::isConstant(mlir::Value v) { - return isConstantHelper( - v, [](mlir::StringAttr attr){return attr.getValue().str();} - ); +template <> std::pair CompilerUtils::isConstant(mlir::Value v) { + return isConstantHelper(v, + [](mlir::StringAttr attr) { return attr.getValue().str(); }); } -template<> -std::pair CompilerUtils::isConstant(mlir::Value v) { +template <> std::pair CompilerUtils::isConstant(mlir::Value v) { return isConstantHelper( - v, [](mlir::IntegerAttr attr){return attr.getValue().getLimitedValue();} - ); + v, [](mlir::IntegerAttr attr) { return attr.getValue().getLimitedValue(); }); } -template<> -std::pair CompilerUtils::isConstant(mlir::Value v) { +template <> std::pair CompilerUtils::isConstant(mlir::Value v) { return isConstantHelper( - v, [](mlir::IntegerAttr attr){return attr.getValue().getLimitedValue();} - ); + v, [](mlir::IntegerAttr attr) { return attr.getValue().getLimitedValue(); }); } -template<> -std::pair CompilerUtils::isConstant(mlir::Value v) { +template <> std::pair CompilerUtils::isConstant(mlir::Value v) { return isConstantHelper( - v, [](mlir::IntegerAttr attr){return attr.getValue().getLimitedValue();} - ); + v, [](mlir::IntegerAttr attr) { return attr.getValue().getLimitedValue(); }); } -template<> 
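For reference, the deleted pass hoisted an op out of an scf.WhileOp body exactly when it was side-effect-free, had no regions, and all of its operands were defined outside the loop body or by ops already scheduled for hoisting. That criterion restated on toy data (not MLIR types; purely illustrative):

    #include <iostream>
    #include <set>
    #include <vector>

    struct TOp {
        int id;
        std::vector<int> operands; // ids of the defining ops
        bool pure;                 // no memory effects, no nested regions
    };

    // Loop-invariance test of the deleted pass: pure, and every operand
    // defined outside the loop or by an op already marked for hoisting.
    bool isHoistable(const TOp &op, const std::set<int> &definedInLoop,
                     const std::set<int> &willBeMoved) {
        if (!op.pure)
            return false;
        for (int d : op.operands)
            if (definedInLoop.count(d) && !willBeMoved.count(d))
                return false;
        return true;
    }

    int main() {
        std::set<int> inLoop{10, 11}, moved;
        TOp a{10, {1}, true};  // uses only a value from outside the loop
        TOp b{11, {10}, true}; // uses a's result
        if (isHoistable(a, inLoop, moved))
            moved.insert(a.id);
        std::cout << isHoistable(b, inLoop, moved) << "\n"; // 1: hoistable after a
    }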
-std::pair CompilerUtils::isConstant(mlir::Value v) { +template <> std::pair CompilerUtils::isConstant(mlir::Value v) { return isConstantHelper( - v, [](mlir::IntegerAttr attr){return attr.getValue().getLimitedValue();} - ); + v, [](mlir::IntegerAttr attr) { return attr.getValue().getLimitedValue(); }); } -template<> -std::pair CompilerUtils::isConstant(mlir::Value v) { +template <> std::pair CompilerUtils::isConstant(mlir::Value v) { return isConstantHelper( - v, [](mlir::IntegerAttr attr){return attr.getValue().getLimitedValue();} - ); + v, [](mlir::IntegerAttr attr) { return attr.getValue().getLimitedValue(); }); } -template<> -std::pair CompilerUtils::isConstant(mlir::Value v) { +template <> std::pair CompilerUtils::isConstant(mlir::Value v) { return isConstantHelper( - v, [](mlir::IntegerAttr attr){return attr.getValue().getLimitedValue();} - ); + v, [](mlir::IntegerAttr attr) { return attr.getValue().getLimitedValue(); }); } -template<> -std::pair CompilerUtils::isConstant(mlir::Value v) { +template <> std::pair CompilerUtils::isConstant(mlir::Value v) { return isConstantHelper( - v, [](mlir::FloatAttr attr){return attr.getValue().convertToFloat();} - ); + v, [](mlir::FloatAttr attr) { return attr.getValue().convertToFloat(); }); } -template<> -std::pair CompilerUtils::isConstant(mlir::Value v) { +template <> std::pair CompilerUtils::isConstant(mlir::Value v) { return isConstantHelper( - v, [](mlir::FloatAttr attr){return attr.getValue().convertToDouble();} - ); + v, [](mlir::FloatAttr attr) { return attr.getValue().convertToDouble(); }); } -template<> -std::pair CompilerUtils::isConstant(mlir::Value v) { - return isConstantHelper( - v, [](mlir::BoolAttr attr){return attr.getValue();} - ); +template <> std::pair CompilerUtils::isConstant(mlir::Value v) { + return isConstantHelper(v, [](mlir::BoolAttr attr) { return attr.getValue(); }); } // ************************************************************************************************** // Specializations of constantOrThrow for various types // ************************************************************************************************** -template<> -std::string CompilerUtils::constantOrThrow(mlir::Value v, const std::string & errorMsg) { +template <> std::string CompilerUtils::constantOrThrow(mlir::Value v, const std::string &errorMsg) { return constantOrThrowHelper( - v, [](mlir::StringAttr attr){return attr.getValue().str();}, errorMsg, "string" - ); + v, [](mlir::StringAttr attr) { return attr.getValue().str(); }, errorMsg, "string"); } -template<> -int64_t CompilerUtils::constantOrThrow(mlir::Value v, const std::string & errorMsg) { +template <> int64_t CompilerUtils::constantOrThrow(mlir::Value v, const std::string &errorMsg) { return constantOrThrowHelper( - v, [](mlir::IntegerAttr attr){return attr.getValue().getLimitedValue();}, errorMsg, "integer" - ); + v, [](mlir::IntegerAttr attr) { return attr.getValue().getLimitedValue(); }, errorMsg, "integer"); } -template<> -uint64_t CompilerUtils::constantOrThrow(mlir::Value v, const std::string & errorMsg) { +template <> uint64_t CompilerUtils::constantOrThrow(mlir::Value v, const std::string &errorMsg) { return constantOrThrowHelper( - v, [](mlir::IntegerAttr attr){return attr.getValue().getLimitedValue();}, errorMsg, "integer" - ); + v, [](mlir::IntegerAttr attr) { return attr.getValue().getLimitedValue(); }, errorMsg, "integer"); } -template<> -float CompilerUtils::constantOrThrow(mlir::Value v, const std::string & errorMsg) { +template <> float 
CompilerUtils::constantOrThrow(mlir::Value v, const std::string &errorMsg) { return constantOrThrowHelper( - v, [](mlir::FloatAttr attr){return attr.getValue().convertToFloat();}, errorMsg, "float" - ); + v, [](mlir::FloatAttr attr) { return attr.getValue().convertToFloat(); }, errorMsg, "float"); } -template<> -double CompilerUtils::constantOrThrow(mlir::Value v, const std::string & errorMsg) { +template <> double CompilerUtils::constantOrThrow(mlir::Value v, const std::string &errorMsg) { return constantOrThrowHelper( - v, [](mlir::FloatAttr attr){return attr.getValue().convertToDouble();}, errorMsg, "double" - ); + v, [](mlir::FloatAttr attr) { return attr.getValue().convertToDouble(); }, errorMsg, "double"); } -template<> -bool CompilerUtils::constantOrThrow(mlir::Value v, const std::string & errorMsg) { +template <> bool CompilerUtils::constantOrThrow(mlir::Value v, const std::string &errorMsg) { return constantOrThrowHelper( - v, [](mlir::BoolAttr attr){return attr.getValue();}, errorMsg, "bool" - ); + v, [](mlir::BoolAttr attr) { return attr.getValue(); }, errorMsg, "bool"); } // ************************************************************************************************** // Specializations of constantOrDefault for various types // ************************************************************************************************** -template<> -std::string CompilerUtils::constantOrDefault(mlir::Value v, std::string d) { +template <> std::string CompilerUtils::constantOrDefault(mlir::Value v, std::string d) { return constantOrDefaultHelper( - v, std::move(d), [](mlir::StringAttr attr){return attr.getValue().str();} - ); + v, std::move(d), [](mlir::StringAttr attr) { return attr.getValue().str(); }); } -template<> -int64_t CompilerUtils::constantOrDefault(mlir::Value v, int64_t d) { +template <> int64_t CompilerUtils::constantOrDefault(mlir::Value v, int64_t d) { return constantOrDefaultHelper( - v, d, [](mlir::IntegerAttr attr){return attr.getValue().getLimitedValue();} - ); + v, d, [](mlir::IntegerAttr attr) { return attr.getValue().getLimitedValue(); }); } -template<> -uint64_t CompilerUtils::constantOrDefault(mlir::Value v, uint64_t d) { +template <> uint64_t CompilerUtils::constantOrDefault(mlir::Value v, uint64_t d) { return constantOrDefaultHelper( - v, d, [](mlir::IntegerAttr attr){return attr.getValue().getLimitedValue();} - ); + v, d, [](mlir::IntegerAttr attr) { return attr.getValue().getLimitedValue(); }); } -template<> -float CompilerUtils::constantOrDefault(mlir::Value v, float d) { +template <> float CompilerUtils::constantOrDefault(mlir::Value v, float d) { return constantOrDefaultHelper( - v, d, [](mlir::FloatAttr attr){return attr.getValue().convertToFloat();} - ); + v, d, [](mlir::FloatAttr attr) { return attr.getValue().convertToFloat(); }); } -template<> -double CompilerUtils::constantOrDefault(mlir::Value v, double d) { +template <> double CompilerUtils::constantOrDefault(mlir::Value v, double d) { return constantOrDefaultHelper( - v, d, [](mlir::FloatAttr attr){return attr.getValue().convertToDouble();} - ); + v, d, [](mlir::FloatAttr attr) { return attr.getValue().convertToDouble(); }); } -template<> -bool CompilerUtils::constantOrDefault(mlir::Value v, bool d) { - return constantOrDefaultHelper( - v, d, [](mlir::BoolAttr attr){return attr.getValue();} - ); +template <> bool CompilerUtils::constantOrDefault(mlir::Value v, bool d) { + return constantOrDefaultHelper(v, d, [](mlir::BoolAttr attr) { return attr.getValue(); }); } // 
************************************************************************************************** @@ -214,8 +169,6 @@ bool CompilerUtils::constantOrDefault(mlir::Value v, bool d) { } bool CompilerUtils::isMatrixComputation(mlir::Operation *v) { - return - llvm::any_of(v->getOperandTypes(), [&](mlir::Type ty){ return llvm::isa(ty); }) - || - llvm::any_of(v->getResultTypes(), [&](mlir::Type ty){ return llvm::isa(ty); }); + return llvm::any_of(v->getOperandTypes(), [&](mlir::Type ty) { return llvm::isa(ty); }) || + llvm::any_of(v->getResultTypes(), [&](mlir::Type ty) { return llvm::isa(ty); }); } diff --git a/src/compiler/utils/CompilerUtils.h b/src/compiler/utils/CompilerUtils.h index fe418419e..a8dd3dfd2 100644 --- a/src/compiler/utils/CompilerUtils.h +++ b/src/compiler/utils/CompilerUtils.h @@ -16,6 +16,7 @@ #pragma once +// clang-format off #include #include #include "util/ErrorHandler.h" @@ -25,183 +26,187 @@ #include #include +// clang-format on struct CompilerUtils { -private: - - template - static std::pair isConstantHelper(mlir::Value v, const std::function& func) { - if(auto co = v.getDefiningOp()) - if(auto attr = co.getValue().dyn_cast()) + private: + template + static std::pair isConstantHelper(mlir::Value v, const std::function &func) { + if (auto co = v.getDefiningOp()) + if (auto attr = co.getValue().dyn_cast()) return std::make_pair(true, func(attr)); - if(auto co = v.getDefiningOp()) - if(auto attr = co.getValue().dyn_cast()) + if (auto co = v.getDefiningOp()) + if (auto attr = co.getValue().dyn_cast()) return std::make_pair(true, func(attr)); return std::make_pair(false, ValT(0)); } - template - static ValT constantOrThrowHelper(mlir::Value v, std::function func, const std::string & errorMsg, const std::string & valTypeName) { + template + static ValT constantOrThrowHelper(mlir::Value v, std::function func, + const std::string &errorMsg, const std::string &valTypeName) { auto p = isConstantHelper(v, func); - if(p.first) + if (p.first) return p.second; else - throw ErrorHandler::compilerError(v.getLoc(), "constantOrThrow", - errorMsg.empty() ? - ("the given value must be a constant of " + valTypeName + " type") - : errorMsg - ); + throw ErrorHandler::compilerError( + v.getLoc(), "constantOrThrow", + errorMsg.empty() ? ("the given value must be a constant of " + valTypeName + " type") : errorMsg); } - template + template static ValT constantOrDefaultHelper(mlir::Value v, ValT d, std::function func) { auto p = isConstantHelper(v, func); - if(p.first) + if (p.first) return p.second; else return d; } - -public: + public: /** - * @brief If the given `Value` is defined by some constant operation, return that constant - * operation; otherwise, return `nullptr`. - * + * @brief If the given `Value` is defined by some constant operation, return + * that constant operation; otherwise, return `nullptr`. + * * @param v The `Value`. * @return The defining constant operation or `nullptr`. */ - static mlir::Operation * constantOfAnyType(mlir::Value v) { - if(auto co = v.getDefiningOp()) + static mlir::Operation *constantOfAnyType(mlir::Value v) { + if (auto co = v.getDefiningOp()) return co; - if(auto co = v.getDefiningOp()) + if (auto co = v.getDefiningOp()) return co; return nullptr; } /** - * @brief Returns if the given `Value` is a constant, and if so, also the constant itself. - * + * @brief Returns if the given `Value` is a constant, and if so, also the + * constant itself. + * * @tparam T The C++ type of the constant to extract. * @param v The `Value`. 
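All the isConstant<T>/constantOrThrow/constantOrDefault specializations above funnel into one generic helper parameterized by an attribute-to-value extractor. The pattern, reduced to standalone C++ with a toy Attr type (assumed names, for illustration):

    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <optional>
    #include <utility>

    // Stand-in for an MLIR attribute that may carry an integer payload.
    struct Attr {
        std::optional<long> intVal;
    };

    // One helper, many specializations: each isConstant<T> only differs
    // in the lambda that turns the attribute into a T.
    template <typename ValT>
    std::pair<bool, ValT> isConstantHelper(const Attr &a, const std::function<ValT(long)> &extract) {
        if (a.intVal)
            return {true, extract(*a.intVal)};
        return {false, ValT(0)};
    }

    int main() {
        auto [ok, v] = isConstantHelper<int64_t>(Attr{42}, [](long x) { return x; });
        std::cout << ok << " " << v << "\n"; // 1 42
    }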
- * @return If the given value is a constant: a pair of the value `true` and the constant value as type `T`; - * otherwise, a pair of the value `false` and an unspecified value of type `T`. + * @return If the given value is a constant: a pair of the value `true` and + * the constant value as type `T`; otherwise, a pair of the value `false` + * and an unspecified value of type `T`. */ - template - static std::pair isConstant(mlir::Value v); - + template static std::pair isConstant(mlir::Value v); + /** - * @brief Returns a constant extracted from the given `Value`, or throws an exception if this is not possible. - * + * @brief Returns a constant extracted from the given `Value`, or throws an + * exception if this is not possible. + * * @tparam T The C++ type of the constant to extract. * @param v The `Value`. - * @param errorMsg The message of the exception to throw. In case of an empty string (default), the exception - * will have a generic error message. - * @return The extracted constant as a value of type `T`, if the given value is a constant. + * @param errorMsg The message of the exception to throw. In case of an + * empty string (default), the exception will have a generic error message. + * @return The extracted constant as a value of type `T`, if the given value + * is a constant. */ - template - static T constantOrThrow(mlir::Value v, const std::string & errorMsg = ""); + template static T constantOrThrow(mlir::Value v, const std::string &errorMsg = ""); /** - * @brief Returns a constant extracted from the given `Value`, or a default value if this is not possible. - * + * @brief Returns a constant extracted from the given `Value`, or a default + * value if this is not possible. + * * @tparam T The C++ type of the constant to extract. * @param v The `Value`. * @param d The default value. - * @return The extracted constant as a value of type `T`, if the given value is a constant, or the given - * default value, otherwise. + * @return The extracted constant as a value of type `T`, if the given value + * is a constant, or the given default value, otherwise. */ - template - static T constantOrDefault(mlir::Value v, T d); + template static T constantOrDefault(mlir::Value v, T d); [[maybe_unused]] static FileMetaData getFileMetaData(mlir::Value filename); /** - * @brief Produces a string containing the C++ type name of the corresponding MLIR type. Mainly used to - * generate function names for generated kernel libraries. This function is defined recursively to also print - * the value types of templated containers (e.g., DenseMatrix). A pragma is added to silence clang-tidy which - * might complain about recursion. + * @brief Produces a string containing the C++ type name of the + * corresponding MLIR type. Mainly used to generate function names for + * generated kernel libraries. This function is defined recursively to also + * print the value types of templated containers (e.g., DenseMatrix). + * A pragma is added to silence clang-tidy which might complain about + * recursion. * * @param t MLIR type name - * @param angleBrackets If `true` (default), angle brackets are used for C++ template types (e.g., `DenseMatrix`); - * Otherwise, underscores are used (e.g., `DenseMatrix_float`). - * @param generalizeToStructure If `true`, `Structure` is used instead of derived types like `DenseMatrix` etc. + * @param angleBrackets If `true` (default), angle brackets are used for C++ + * template types (e.g., `DenseMatrix`); Otherwise, underscores are + * used (e.g., `DenseMatrix_float`). 
+ * @param generalizeToStructure If `true`, `Structure` is used instead of + * derived types like `DenseMatrix` etc. * @return A string representation of the C++ type names */ - // TODO The parameter generalizeToStructure seems to be used only by some remaining kernel name generation - // in LowerToLLVMPass. Once those call-sites have been refactored to use the kernel catalog, this feature + // TODO The parameter generalizeToStructure seems to be used only by some + // remaining kernel name generation in LowerToLLVMPass. Once those + // call-sites have been refactored to use the kernel catalog, this feature // can be removed here. - static std::string mlirTypeToCppTypeName(mlir::Type t, bool angleBrackets = true, bool generalizeToStructure = false) { // NOLINT(misc-no-recursion) - if(t.isF64()) + static std::string mlirTypeToCppTypeName(mlir::Type t, bool angleBrackets = true, + bool generalizeToStructure = false) { // NOLINT(misc-no-recursion) + if (t.isF64()) return "double"; - else if(t.isF32()) + else if (t.isF32()) return "float"; - else if(t.isSignedInteger(8)) + else if (t.isSignedInteger(8)) return "int8_t"; - else if(t.isSignedInteger(32)) + else if (t.isSignedInteger(32)) return "int32_t"; - else if(t.isSignedInteger(64)) + else if (t.isSignedInteger(64)) return "int64_t"; - else if(t.isUnsignedInteger(8)) + else if (t.isUnsignedInteger(8)) return "uint8_t"; - else if(t.isUnsignedInteger(32)) + else if (t.isUnsignedInteger(32)) return "uint32_t"; - else if(t.isUnsignedInteger(64)) + else if (t.isUnsignedInteger(64)) return "uint64_t"; - else if(t.isSignlessInteger(1)) + else if (t.isSignlessInteger(1)) return "bool"; - else if(t.isIndex()) + else if (t.isIndex()) return "size_t"; - else if(t.isa()) + else if (t.isa()) return "Structure"; - else if(auto matTy = t.dyn_cast()) { - if(generalizeToStructure) + else if (auto matTy = t.dyn_cast()) { + if (generalizeToStructure) return "Structure"; else { switch (matTy.getRepresentation()) { - case mlir::daphne::MatrixRepresentation::Dense: { - const std::string vtName = mlirTypeToCppTypeName(matTy.getElementType(), angleBrackets, false); - return angleBrackets ? ("DenseMatrix<" + vtName + ">") : ("DenseMatrix_" + vtName); - } - case mlir::daphne::MatrixRepresentation::Sparse: { - const std::string vtName = mlirTypeToCppTypeName(matTy.getElementType(), angleBrackets, false); - return angleBrackets ? ("CSRMatrix<" + vtName + ">") : ("CSRMatrix_" + vtName); - } + case mlir::daphne::MatrixRepresentation::Dense: { + const std::string vtName = mlirTypeToCppTypeName(matTy.getElementType(), angleBrackets, false); + return angleBrackets ? ("DenseMatrix<" + vtName + ">") : ("DenseMatrix_" + vtName); + } + case mlir::daphne::MatrixRepresentation::Sparse: { + const std::string vtName = mlirTypeToCppTypeName(matTy.getElementType(), angleBrackets, false); + return angleBrackets ? ("CSRMatrix<" + vtName + ">") : ("CSRMatrix_" + vtName); + } } } - } - else if(llvm::isa(t)) - if(generalizeToStructure) + } else if (llvm::isa(t)) + if (generalizeToStructure) return "Structure"; else return "Frame"; - else if(auto lstTy = t.dyn_cast()) { - if(generalizeToStructure) + else if (auto lstTy = t.dyn_cast()) { + if (generalizeToStructure) return "Structure"; else { const std::string dtName = mlirTypeToCppTypeName(lstTy.getElementType(), angleBrackets, false); return angleBrackets ? 
("List<" + dtName + ">") : ("List_" + dtName); } - } - else if(llvm::isa(t)) + } else if (llvm::isa(t)) // This becomes "const char *" (which makes perfect sense for // strings) when inserted into the typical "const DT *" template of // kernel input parameters. return "char"; - else if(llvm::isa(t)) + else if (llvm::isa(t)) return "DaphneContext"; - else if(auto handleTy = t.dyn_cast()) { - const std::string tName = mlirTypeToCppTypeName(handleTy.getDataType(), angleBrackets, generalizeToStructure); + else if (auto handleTy = t.dyn_cast()) { + const std::string tName = + mlirTypeToCppTypeName(handleTy.getDataType(), angleBrackets, generalizeToStructure); return angleBrackets ? ("Handle<" + tName + ">") : ("Handle_" + tName); - } - else if(llvm::isa(t)) + } else if (llvm::isa(t)) return "File"; - else if(llvm::isa(t)) + else if (llvm::isa(t)) return "Descriptor"; - else if(llvm::isa(t)) + else if (llvm::isa(t)) return "Target"; - else if(auto memRefType = t.dyn_cast()) { + else if (auto memRefType = t.dyn_cast()) { const std::string vtName = mlirTypeToCppTypeName(memRefType.getElementType(), angleBrackets, false); return angleBrackets ? ("StridedMemRefType<" + vtName + ",2>") : ("StridedMemRefType_" + vtName + "_2"); } @@ -209,60 +214,55 @@ struct CompilerUtils { std::string typeName; llvm::raw_string_ostream rsos(typeName); t.print(rsos); - throw std::runtime_error( - "no C++ type name known for the given MLIR type: " + typeName - ); + throw std::runtime_error("no C++ type name known for the given MLIR type: " + typeName); } static bool isMatrixComputation(mlir::Operation *v); /** * @brief Returns the DAPHNE context used in the given function. - * + * * Throws if there is not exactly one DAPHNE context. - * + * * @param func - * @return + * @return */ - [[maybe_unused]] mlir::Value static getDaphneContext(mlir::func::FuncOp & func) { + [[maybe_unused]] mlir::Value static getDaphneContext(mlir::func::FuncOp &func) { mlir::Value dctx = nullptr; auto ops = func.getBody().front().getOps(); - for(auto op : ops) { - if(!dctx) + for (auto op : ops) { + if (!dctx) dctx = op.getResult(); else throw ErrorHandler::compilerError(op.getLoc(), "getDaphneContext", - "function body block contains more than one CreateDaphneContextOp" - ); + "function body block contains more than one " + "CreateDaphneContextOp"); } - if(!dctx) + if (!dctx) throw ErrorHandler::compilerError(func.getLoc(), "getDaphneContext", - "function body block contains no CreateDaphneContextOp" - ); + "function body block contains no CreateDaphneContextOp"); return dctx; } - + [[maybe_unused]] static bool isObjType(mlir::Type t) { return llvm::isa(t); } - - [[maybe_unused]] static bool hasObjType(mlir::Value v) { - return isObjType(v.getType()); - } + + [[maybe_unused]] static bool hasObjType(mlir::Value v) { return isObjType(v.getType()); } /** * @brief Returns the value type of the given scalar/matrix/frame type. - * + * * For matrices and frames, the value type is extracted. For scalars, * the type itself is the value type. - * + * * @param t the given scalar/matrix/frame type * @return the value type of the given type */ static mlir::Type getValueType(mlir::Type t) { - if(auto mt = t.dyn_cast()) + if (auto mt = t.dyn_cast()) return mt.getElementType(); - if(auto ft = t.dyn_cast()) + if (auto ft = t.dyn_cast()) throw std::runtime_error("getValueType() doesn't support frames yet"); // TODO else // TODO Check if this is really a scalar. 
return t; @@ -271,18 +271,18 @@ struct CompilerUtils { /** * @brief Sets the value type of the given scalar/matrix/frame type to the * given value type and returns this derived type. - * + * * For matrices and frames, the value type is set to the given value type. * For scalars, the given value type itself is returned. - * + * * @param t the scalar/matrix/frame type whose value type shall be set * @param vt the value type to use * @return the derived scalar/matrix/frame type */ static mlir::Type setValueType(mlir::Type t, mlir::Type vt) { - if(auto mt = t.dyn_cast()) + if (auto mt = t.dyn_cast()) return mt.withElementType(vt); - if(auto ft = t.dyn_cast()) + if (auto ft = t.dyn_cast()) throw std::runtime_error("setValueType() doesn't support frames yet"); // TODO else // TODO Check if this is really a scalar. return vt; @@ -291,13 +291,13 @@ struct CompilerUtils { /** * @brief Checks if the two given types are the same, whereby * DaphneIR's unknown type acts as a wildcard. - * + * * The two types are considered equal, iff they are exactly the same * type, or one of the following "excuses" holds: * - at least one of the types is unknown * - both types are matrices and at least one of them has an unknown * value type - * + * * @param t1 The first type * @param t2 The second type * @result `true` if the two types are considered equal, `false` otherwise @@ -311,16 +311,13 @@ struct CompilerUtils { // The two types are exactly the same... t1 == t2 // ...or one of the following "excuses" holds: - || ( + || + ( // at least one of the types is unknown llvm::isa(t1) || llvm::isa(t2) || // both types are matrices and at least one of them // has an unknown value type - (matT1 && matT2 && ( - llvm::isa(matT1.getElementType()) || - llvm::isa(matT2.getElementType()) - )) - ) - ); + (matT1 && matT2 && + (llvm::isa(matT1.getElementType()) || llvm::isa(matT2.getElementType()))))); } }; diff --git a/src/compiler/utils/LoweringUtils.cpp b/src/compiler/utils/LoweringUtils.cpp index b9f3107a2..230ff0281 100644 --- a/src/compiler/utils/LoweringUtils.cpp +++ b/src/compiler/utils/LoweringUtils.cpp @@ -28,8 +28,7 @@ #include "mlir/Transforms/Passes.h" /// Insert an allocation for the given MemRefType. -mlir::Value insertMemRefAlloc(mlir::MemRefType type, mlir::Location loc, - mlir::PatternRewriter &rewriter) { +mlir::Value insertMemRefAlloc(mlir::MemRefType type, mlir::Location loc, mlir::PatternRewriter &rewriter) { auto alloc = rewriter.create(loc, type); // Make sure to allocate at the beginning of the block. 
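
A quick sketch of the `getValueType`/`setValueType` helpers above (illustration only; it exercises just the documented scalar behavior, since the frame cases still throw):

    #include <cassert>

    // For scalars, the type itself is the value type, and setValueType simply
    // returns the new value type; for matrices, the element type is used.
    void valueTypeSketch(mlir::MLIRContext *ctx) {
        mlir::Type f64 = mlir::Float64Type::get(ctx);
        mlir::Type si64 = mlir::IntegerType::get(ctx, 64, mlir::IntegerType::Signed);
        assert(CompilerUtils::getValueType(f64) == f64);
        assert(CompilerUtils::setValueType(f64, si64) == si64);
    }
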
@@ -39,49 +38,13 @@ mlir::Value insertMemRefAlloc(mlir::MemRefType type, mlir::Location loc, return alloc; } -void insertMemRefDealloc(mlir::Value memref, mlir::Location loc, - mlir::PatternRewriter &rewriter) { +void insertMemRefDealloc(mlir::Value memref, mlir::Location loc, mlir::PatternRewriter &rewriter) { auto dealloc = rewriter.create(loc, memref); dealloc->moveBefore(&memref.getParentBlock()->back()); } -// TODO(phil) try to provide function templates to remove duplication -void affineFillMemRefInt(int value, mlir::ConversionPatternRewriter &rewriter, - mlir::Location loc, mlir::ArrayRef shape, - mlir::MLIRContext *ctx, mlir::Value memRef, - mlir::Type elemType) { - constexpr int ROW = 0; - constexpr int COL = 1; - mlir::Value fillValue = rewriter.create( - loc, rewriter.getI64Type(), rewriter.getI64IntegerAttr(value)); - - llvm::SmallVector loopIvs; - - auto outerLoop = rewriter.create(loc, 0, shape[ROW], 1); - for (mlir::Operation &nested : *outerLoop.getBody()) { - rewriter.eraseOp(&nested); - } - loopIvs.push_back(outerLoop.getInductionVar()); - - // outer loop body - rewriter.setInsertionPointToStart(outerLoop.getBody()); - auto innerLoop = rewriter.create(loc, 0, shape[COL], 1); - for (mlir::Operation &nested : *innerLoop.getBody()) { - rewriter.eraseOp(&nested); - } - loopIvs.push_back(innerLoop.getInductionVar()); - rewriter.create(loc); - rewriter.setInsertionPointToStart(innerLoop.getBody()); - rewriter.create(loc, fillValue, memRef, loopIvs); - - rewriter.create(loc); - rewriter.setInsertionPointAfter(outerLoop); -} - -// Specify the fill Value directly -void affineFillMemRefInt(mlir::Value value, mlir::ConversionPatternRewriter &rewriter, - mlir::Location loc, mlir::ArrayRef shape, - mlir::MLIRContext *ctx, mlir::Value memRef) { +void affineFillMemRef(mlir::Value value, mlir::ConversionPatternRewriter &rewriter, mlir::Location loc, + mlir::ArrayRef shape, mlir::MLIRContext *ctx, mlir::Value memRef) { constexpr int ROW = 0; constexpr int COL = 1; llvm::SmallVector loopIvs; @@ -107,47 +70,11 @@ void affineFillMemRefInt(mlir::Value value, mlir::ConversionPatternRewriter &rew rewriter.setInsertionPointAfter(outerLoop); } -void affineFillMemRef(double value, mlir::ConversionPatternRewriter &rewriter, - mlir::Location loc, mlir::ArrayRef shape, - mlir::MLIRContext *ctx, mlir::Value memRef, - mlir::Type elemType) { - constexpr int ROW = 0; - constexpr int COL = 1; - mlir::Value fillValue = rewriter.create( - loc, elemType, rewriter.getFloatAttr(elemType, value)); - - llvm::SmallVector loopIvs; - - auto outerLoop = rewriter.create(loc, 0, shape[ROW], 1); - for (mlir::Operation &nested : *outerLoop.getBody()) { - rewriter.eraseOp(&nested); - } - loopIvs.push_back(outerLoop.getInductionVar()); - - // outer loop body - rewriter.setInsertionPointToStart(outerLoop.getBody()); - auto innerLoop = rewriter.create(loc, 0, shape[COL], 1); - for (mlir::Operation &nested : *innerLoop.getBody()) { - rewriter.eraseOp(&nested); - } - loopIvs.push_back(innerLoop.getInductionVar()); - rewriter.create(loc); - rewriter.setInsertionPointToStart(innerLoop.getBody()); - rewriter.create(loc, fillValue, memRef, loopIvs); - - rewriter.create(loc); - rewriter.setInsertionPointAfter(outerLoop); -} - -mlir::Value convertMemRefToDenseMatrix( - mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, - mlir::Value memRef, mlir::Type type) { - auto extractStridedMetadataOp = - rewriter.create(loc, memRef); +mlir::Value convertMemRefToDenseMatrix(mlir::Location loc, mlir::ConversionPatternRewriter 
&rewriter, + mlir::Value memRef, mlir::Type type) { + auto extractStridedMetadataOp = rewriter.create(loc, memRef); // aligned ptr (memref.data) - mlir::Value alignedPtr = - rewriter.create(loc, - memRef); + mlir::Value alignedPtr = rewriter.create(loc, memRef); // offset mlir::Value offset = extractStridedMetadataOp.getOffset(); // strides @@ -155,51 +82,38 @@ mlir::Value convertMemRefToDenseMatrix( // sizes mlir::ResultRange sizes = extractStridedMetadataOp.getSizes(); - return rewriter.create( - loc, type, alignedPtr, offset, sizes[0], sizes[1], strides[0], - strides[1]); + return rewriter.create(loc, type, alignedPtr, offset, sizes[0], sizes[1], + strides[0], strides[1]); } mlir::Type convertFloat(mlir::FloatType floatType) { - return mlir::IntegerType::get(floatType.getContext(), - floatType.getIntOrFloatBitWidth()); + return mlir::IntegerType::get(floatType.getContext(), floatType.getIntOrFloatBitWidth()); } mlir::Type convertInteger(mlir::IntegerType intType) { - return mlir::IntegerType::get(intType.getContext(), - intType.getIntOrFloatBitWidth()); + return mlir::IntegerType::get(intType.getContext(), intType.getIntOrFloatBitWidth()); } -llvm::Optional materializeCastFromIllegal(mlir::OpBuilder &builder, - mlir::Type type, - mlir::ValueRange inputs, - mlir::Location loc) { +llvm::Optional materializeCastFromIllegal(mlir::OpBuilder &builder, mlir::Type type, + mlir::ValueRange inputs, mlir::Location loc) { mlir::Type fromType = getElementTypeOrSelf(inputs[0].getType()); mlir::Type toType = getElementTypeOrSelf(type); - if ((!fromType.isSignedInteger() && !fromType.isUnsignedInteger()) || - !toType.isSignlessInteger()) + if ((!fromType.isSignedInteger() && !fromType.isUnsignedInteger()) || !toType.isSignlessInteger()) return std::nullopt; // Use unrealized conversion casts to do signful->signless conversions. - return builder - .create(loc, type, inputs[0]) - ->getResult(0); + return builder.create(loc, type, inputs[0])->getResult(0); } -llvm::Optional materializeCastToIllegal(mlir::OpBuilder &builder, - mlir::Type type, - mlir::ValueRange inputs, +llvm::Optional materializeCastToIllegal(mlir::OpBuilder &builder, mlir::Type type, mlir::ValueRange inputs, mlir::Location loc) { mlir::Type fromType = getElementTypeOrSelf(inputs[0].getType()); mlir::Type toType = getElementTypeOrSelf(type); - if (!fromType.isSignlessInteger() || - (!toType.isSignedInteger() && !toType.isUnsignedInteger())) + if (!fromType.isSignlessInteger() || (!toType.isSignedInteger() && !toType.isUnsignedInteger())) return std::nullopt; // Use unrealized conversion casts to do signless->signful conversions. 
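    // Editorial sketch (not part of the patch): these materialization hooks are
    // intended to be registered on a mlir::TypeConverter in a lowering pass, e.g.:
    //   converter.addConversion([](mlir::IntegerType t) { return convertInteger(t); });
    //   converter.addConversion([](mlir::FloatType t) { return convertFloat(t); });
    //   converter.addSourceMaterialization(materializeCastToIllegal);
    //   converter.addTargetMaterialization(materializeCastFromIllegal);
    // The exact registration site in DAPHNE's passes may differ.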
- return builder - .create(loc, type, inputs[0]) - ->getResult(0); + return builder.create(loc, type, inputs[0])->getResult(0); } mlir::Operation *findLastUseOfSSAValue(mlir::Value &v) { diff --git a/src/compiler/utils/LoweringUtils.h b/src/compiler/utils/LoweringUtils.h index e5c49123a..6a3d11ed3 100644 --- a/src/compiler/utils/LoweringUtils.h +++ b/src/compiler/utils/LoweringUtils.h @@ -33,39 +33,20 @@ #include "mlir/Transforms/DialectConversion.h" #include "llvm/ADT/ArrayRef.h" -mlir::Value insertMemRefAlloc(mlir::MemRefType type, mlir::Location loc, - mlir::PatternRewriter &rewriter); +mlir::Value insertMemRefAlloc(mlir::MemRefType type, mlir::Location loc, mlir::PatternRewriter &rewriter); -void insertMemRefDealloc(mlir::Value memref, mlir::Location loc, - mlir::PatternRewriter &rewriter); +void insertMemRefDealloc(mlir::Value memref, mlir::Location loc, mlir::PatternRewriter &rewriter); -void affineFillMemRefInt(int value, mlir::ConversionPatternRewriter &rewriter, - mlir::Location loc, mlir::ArrayRef shape, - mlir::MLIRContext *ctx, mlir::Value memRef, - mlir::Type elemType); +void affineFillMemRef(mlir::Value value, mlir::ConversionPatternRewriter &rewriter, mlir::Location loc, + mlir::ArrayRef shape, mlir::MLIRContext *ctx, mlir::Value memRef); -void affineFillMemRefInt(mlir::Value value, - mlir::ConversionPatternRewriter &rewriter, - mlir::Location loc, mlir::ArrayRef shape, - mlir::MLIRContext *ctx, mlir::Value memRef); +mlir::Value convertMemRefToDenseMatrix(mlir::Location, mlir::ConversionPatternRewriter &, mlir::Value memRef, + mlir::Type); -void affineFillMemRef(double value, mlir::ConversionPatternRewriter &rewriter, - mlir::Location loc, mlir::ArrayRef shape, - mlir::MLIRContext *ctx, mlir::Value memRef, - mlir::Type elemType); +llvm::Optional materializeCastFromIllegal(mlir::OpBuilder &builder, mlir::Type type, + mlir::ValueRange inputs, mlir::Location loc); -mlir::Value convertMemRefToDenseMatrix(mlir::Location, - mlir::ConversionPatternRewriter &, - mlir::Value memRef, mlir::Type); - -llvm::Optional materializeCastFromIllegal(mlir::OpBuilder &builder, - mlir::Type type, - mlir::ValueRange inputs, - mlir::Location loc); - -llvm::Optional materializeCastToIllegal(mlir::OpBuilder &builder, - mlir::Type type, - mlir::ValueRange inputs, +llvm::Optional materializeCastToIllegal(mlir::OpBuilder &builder, mlir::Type type, mlir::ValueRange inputs, mlir::Location loc); mlir::Type convertFloat(mlir::FloatType floatType); diff --git a/src/compiler/utils/TypePrinting.cpp b/src/compiler/utils/TypePrinting.cpp index bb2e98582..dc58cb6b3 100644 --- a/src/compiler/utils/TypePrinting.cpp +++ b/src/compiler/utils/TypePrinting.cpp @@ -22,7 +22,7 @@ #include #include -std::ostream & operator<<(std::ostream & os, mlir::Type t) { +std::ostream &operator<<(std::ostream &os, mlir::Type t) { std::string s; llvm::raw_string_ostream rsos(s); t.print(rsos); diff --git a/src/compiler/utils/TypePrinting.h b/src/compiler/utils/TypePrinting.h index cfde50395..50d5d7bdd 100644 --- a/src/compiler/utils/TypePrinting.h +++ b/src/compiler/utils/TypePrinting.h @@ -20,4 +20,4 @@ #include -std::ostream & operator<<(std::ostream & os, mlir::Type t); \ No newline at end of file +std::ostream &operator<<(std::ostream &os, mlir::Type t); \ No newline at end of file diff --git a/src/ir/daphneir/CMakeLists.txt b/src/ir/daphneir/CMakeLists.txt index 87a0fb6ac..3016dc0c5 100644 --- a/src/ir/daphneir/CMakeLists.txt +++ b/src/ir/daphneir/CMakeLists.txt @@ -36,6 +36,8 @@ add_mlir_doc(Passes -gen-pass-doc DaphnePasses 
Dialects/) add_mlir_dialect_library(MLIRDaphne DaphneDialect.cpp + Fold.cpp + Canonicalize.cpp DaphneDistributableOpInterface.cpp DaphneInferFrameLabelsOpInterface.cpp DaphneInferShapeOpInterface.cpp diff --git a/src/ir/daphneir/Canonicalize.cpp b/src/ir/daphneir/Canonicalize.cpp new file mode 100644 index 000000000..b296cc08b --- /dev/null +++ b/src/ir/daphneir/Canonicalize.cpp @@ -0,0 +1,516 @@ +/* + * Copyright 2024 The DAPHNE Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ir/daphneir/Daphne.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Support/LogicalResult.h" +#include + +mlir::LogicalResult mlir::daphne::VectorizedPipelineOp::canonicalize(mlir::daphne::VectorizedPipelineOp op, + mlir::PatternRewriter &rewriter) { + // // Find duplicate inputs + std::vector vSplitsAttrs; + for (auto &split : op.getSplits()) + vSplitsAttrs.push_back(split); + auto currentSize = op.getInputs().size(); + + DenseMap inputMap; + + for (size_t i = 0; i < currentSize; i++) { + const auto &input = op.getInputs()[i]; + const auto &split = vSplitsAttrs[i].cast().getValue(); + + if (inputMap.count(input) == 0) { + inputMap[input] = i; + } else { + size_t j = inputMap[input]; + if (op.getSplits()[j].cast().getValue() == split) { + op.getBody().getArgument(i).replaceAllUsesWith(op.getBody().getArgument(j)); + op.getBody().eraseArgument(i); + op.getInputsMutable().erase(i); + vSplitsAttrs.erase(vSplitsAttrs.begin() + i); + currentSize--; + i--; + } + } + } + + std::vector resultsToReplace; + std::vector outRows; + std::vector outCols; + std::vector vCombineAttrs; + + llvm::BitVector eraseIxs; + eraseIxs.resize(op.getNumResults()); + for (auto result : op.getResults()) { + auto resultIx = result.getResultNumber(); + if (result.use_empty()) { + // remove + eraseIxs.set(resultIx); + } else { + resultsToReplace.push_back(result); + outRows.push_back(op.getOutRows()[resultIx]); + outCols.push_back(op.getOutCols()[resultIx]); + vCombineAttrs.push_back(op.getCombines()[resultIx]); + } + } + op.getBody().front().getTerminator()->eraseOperands(eraseIxs); + if (!op.getCuda().getBlocks().empty()) + op.getCuda().front().getTerminator()->eraseOperands(eraseIxs); + + if (resultsToReplace.size() == op->getNumResults() && op.getSplits().size() == vSplitsAttrs.size()) { + return failure(); + } + auto pipelineOp = rewriter.create( + op.getLoc(), ValueRange(resultsToReplace).getTypes(), op.getInputs(), outRows, outCols, + rewriter.getArrayAttr(vSplitsAttrs), rewriter.getArrayAttr(vCombineAttrs), op.getCtx()); + pipelineOp.getBody().takeBody(op.getBody()); + if (!op.getCuda().getBlocks().empty()) + pipelineOp.getCuda().takeBody(op.getCuda()); + for (auto e : llvm::enumerate(resultsToReplace)) { + auto resultToReplace = e.value(); + auto i = e.index(); + resultToReplace.replaceAllUsesWith(pipelineOp.getResult(i)); + } + op.erase(); + return success(); +} + +/** + * @brief Transposition-aware matrix multiplication + * Identifies if an input to a MatMulOp is the result of a TransposeOp; 
Rewrites + * the Operation, passing transposition info as a flag, instead of transposing + * the matrix before multiplication + */ +mlir::LogicalResult mlir::daphne::MatMulOp::canonicalize(mlir::daphne::MatMulOp op, PatternRewriter &rewriter) { + mlir::Value lhs = op.getLhs(); + mlir::Value rhs = op.getRhs(); + mlir::Value transa = op.getTransa(); + mlir::Value transb = op.getTransb(); + + // TODO If transa or transb are not constant, we cannot continue on the + // respective side; we cannot just assume false then. + bool ta = CompilerUtils::constantOrDefault(transa, false); + bool tb = CompilerUtils::constantOrDefault(transb, false); + + // TODO Turn on the transposition-awareness for the left-hand-side argument + // again (see #447). mlir::daphne::TransposeOp lhsTransposeOp = + // lhs.getDefiningOp(); + mlir::daphne::TransposeOp rhsTransposeOp = rhs.getDefiningOp(); + + // if (!lhsTransposeOp && !rhsTransposeOp){ + if (!rhsTransposeOp) { + return mlir::failure(); + } + + // ToDo: This check prevents merging transpose into matrix multiplication + // because that is not yet supported by our + // sparse kernels. + // ToDo: bring user config here for sparsity threshold or properly use + // MatrixRepresentation + if (auto t = rhs.getType().dyn_cast()) { + auto sparsity = t.getSparsity(); + if (sparsity < 0.25) + return mlir::failure(); + } + +#if 0 + // TODO Adapt PhyOperatorSelectionPass once this code is turned on again. + if(lhsTransposeOp) { + lhs = lhsTransposeOp.getArg(); + ta = !ta; + } +#endif + if (rhsTransposeOp) { + rhs = rhsTransposeOp.getArg(); + tb = !tb; + } + + rewriter.replaceOpWithNewOp( + op, op.getType(), lhs, rhs, + static_cast(rewriter.create(transa.getLoc(), ta)), + static_cast(rewriter.create(transb.getLoc(), tb))); + return mlir::success(); +} + +/** + * @brief Replaces NumRowsOp by a constant, if the #rows of the input is known + * (e.g., due to shape inference). + */ +mlir::LogicalResult mlir::daphne::NumRowsOp::canonicalize(mlir::daphne::NumRowsOp op, PatternRewriter &rewriter) { + ssize_t numRows = -1; + + mlir::Type inTy = op.getArg().getType(); + if (auto t = inTy.dyn_cast()) + numRows = t.getNumRows(); + else if (auto t = inTy.dyn_cast()) + numRows = t.getNumRows(); + + if (numRows != -1) { + rewriter.replaceOpWithNewOp(op, rewriter.getIndexType(), + rewriter.getIndexAttr(numRows)); + return mlir::success(); + } + return mlir::failure(); +} + +/** + * @brief Replaces NumColsOp by a constant, if the #cols of the input is known + * (e.g., due to shape inference). + */ +mlir::LogicalResult mlir::daphne::NumColsOp::canonicalize(mlir::daphne::NumColsOp op, PatternRewriter &rewriter) { + ssize_t numCols = -1; + + mlir::Type inTy = op.getArg().getType(); + if (auto t = inTy.dyn_cast()) + numCols = t.getNumCols(); + else if (auto t = inTy.dyn_cast()) + numCols = t.getNumCols(); + + if (numCols != -1) { + rewriter.replaceOpWithNewOp(op, rewriter.getIndexType(), + rewriter.getIndexAttr(numCols)); + return mlir::success(); + } + return mlir::failure(); +} + +/** + * @brief Replaces NumCellsOp by a constant, if the #rows and #cols of the + * input is known (e.g., due to shape inference). 
+ */ +mlir::LogicalResult mlir::daphne::NumCellsOp::canonicalize(mlir::daphne::NumCellsOp op, PatternRewriter &rewriter) { + ssize_t numRows = -1; + ssize_t numCols = -1; + + mlir::Type inTy = op.getArg().getType(); + if (auto t = inTy.dyn_cast()) { + numRows = t.getNumRows(); + numCols = t.getNumCols(); + } else if (auto t = inTy.dyn_cast()) { + numRows = t.getNumRows(); + numCols = t.getNumCols(); + } + + if (numRows != -1 && numCols != -1) { + rewriter.replaceOpWithNewOp(op, rewriter.getIndexType(), + rewriter.getIndexAttr(numRows * numCols)); + return mlir::success(); + } + return mlir::failure(); +} + +/** + * @brief Replaces SparsityOp by a constant, if the sparsity of the input is + * known (e.g., due to sparsity inference). + */ +mlir::LogicalResult mlir::daphne::SparsityOp::canonicalize(mlir::daphne::SparsityOp op, PatternRewriter &rewriter) { + double sparsity = -1.0; + + mlir::Type inTy = op.getArg().getType(); + if (auto t = inTy.dyn_cast()) + sparsity = t.getSparsity(); + + if (sparsity != -1) { + rewriter.replaceOpWithNewOp(op, sparsity); + return mlir::success(); + } + return mlir::failure(); +} + +/** + * @brief Replaces (1) `a + b` by `a concat b`, if `a` or `b` is a string, + * and (2) `a + X` by `X + a` (`a` scalar, `X` matrix/frame). + * + * (1) is important, since we use the `+`-operator for both addition and + * string concatenation in DaphneDSL, while the types of the operands might be + * known only after type inference. + * + * (2) is important, since our kernels for elementwise binary operations only + * support scalars as the right-hand-side operand so far (see #203). + * + * @param op + * @param rewriter + * @return + */ +mlir::LogicalResult mlir::daphne::EwAddOp::canonicalize(mlir::daphne::EwAddOp op, PatternRewriter &rewriter) { + mlir::Value lhs = op.getLhs(); + mlir::Value rhs = op.getRhs(); + + const bool lhsIsStr = llvm::isa(lhs.getType()); + const bool rhsIsStr = llvm::isa(rhs.getType()); + if (lhsIsStr || rhsIsStr) { + mlir::Type strTy = mlir::daphne::StringType::get(rewriter.getContext()); + if (!lhsIsStr) + lhs = rewriter.create(op.getLoc(), strTy, lhs); + if (!rhsIsStr) + rhs = rewriter.create(op.getLoc(), strTy, rhs); + rewriter.replaceOpWithNewOp(op, strTy, lhs, rhs); + return mlir::success(); + } else { + const bool lhsIsSca = !llvm::isa(lhs.getType()); + const bool rhsIsSca = !llvm::isa(rhs.getType()); + if (lhsIsSca && !rhsIsSca) { + rewriter.replaceOpWithNewOp(op, op.getResult().getType(), rhs, lhs); + return mlir::success(); + } + return mlir::failure(); + } +} + +/** + * @brief Replaces `a - X` by `(X * -1) + a` (`a` scalar, `X` matrix/frame). + * + * This is important, since our kernels for elementwise binary operations only + * support scalars as the right-hand-side operand so far (see #203). + * + * As a downside, an additional operation and intermediate result is introduced. 
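
(A quick numeric check of this identity: with a = 5 and X = [2, 4], `a - X` yields [3, 1], and `(X * -1) + a` yields [-2, -4] + 5 = [3, 1], applied elementwise.)
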
+ * + * @param op + * @param rewriter + * @return + */ +mlir::LogicalResult mlir::daphne::EwSubOp::canonicalize(mlir::daphne::EwSubOp op, PatternRewriter &rewriter) { + mlir::Value lhs = op.getLhs(); + mlir::Value rhs = op.getRhs(); + const bool lhsIsSca = !llvm::isa(lhs.getType()); + const bool rhsIsSca = !llvm::isa(rhs.getType()); + if (lhsIsSca && !rhsIsSca) { + rewriter.replaceOpWithNewOp( + op, op.getResult().getType(), + rewriter.create( + op->getLoc(), + mlir::daphne::UnknownType::get(op->getContext()), // to be inferred + rhs, rewriter.create(op->getLoc(), int64_t(-1))), + lhs); + return mlir::success(); + } + return mlir::failure(); +} + +/** + * @brief Replaces `a * X` by `X * a` (`a` scalar, `X` matrix/frame). + * + * This is important, since our kernels for elementwise binary operations only + * support scalars as the right-hand-side operand so far (see #203). + * + * @param op + * @param rewriter + * @return + */ +mlir::LogicalResult mlir::daphne::EwMulOp::canonicalize(mlir::daphne::EwMulOp op, PatternRewriter &rewriter) { + mlir::Value lhs = op.getLhs(); + mlir::Value rhs = op.getRhs(); + const bool lhsIsSca = !llvm::isa(lhs.getType()); + const bool rhsIsSca = !llvm::isa(rhs.getType()); + if (lhsIsSca && !rhsIsSca) { + rewriter.replaceOpWithNewOp(op, op.getResult().getType(), rhs, lhs); + return mlir::success(); + } + return mlir::failure(); +} + +/** + * @brief Replaces `a / X` by `(X ^ -1) * a` (`a` scalar, `X` matrix/frame), + * if `X` has a floating-point value type. + * + * This is important, since our kernels for elementwise binary operations only + * support scalars as the right-hand-side operand so far (see #203). + * + * As a downside, an additional operation and intermediate result is introduced. + * + * @param op + * @param rewriter + * @return + */ +mlir::LogicalResult mlir::daphne::EwDivOp::canonicalize(mlir::daphne::EwDivOp op, PatternRewriter &rewriter) { + mlir::Value lhs = op.getLhs(); + mlir::Value rhs = op.getRhs(); + const bool lhsIsSca = !llvm::isa(lhs.getType()); + const bool rhsIsSca = !llvm::isa(rhs.getType()); + const bool rhsIsFP = llvm::isa(CompilerUtils::getValueType(rhs.getType())); + if (lhsIsSca && !rhsIsSca && rhsIsFP) { + rewriter.replaceOpWithNewOp( + op, op.getResult().getType(), + rewriter.create(op->getLoc(), + mlir::daphne::UnknownType::get(op->getContext()), // to be inferred + rhs, + rewriter.create(op->getLoc(), double(-1))), + lhs); + return mlir::success(); + } + return mlir::failure(); +} + +/** + * @brief Replaces a `DistributeOp` by a `DistributedReadOp`, if its input + * value (a) is defined by a `ReadOp`, and (b) is not used elsewhere. + * @param context + */ +struct SimplifyDistributeRead : public mlir::OpRewritePattern { + SimplifyDistributeRead(mlir::MLIRContext *context) : OpRewritePattern(context, 1) { + // + } + + mlir::LogicalResult matchAndRewrite(mlir::daphne::DistributeOp op, mlir::PatternRewriter &rewriter) const override { + mlir::daphne::ReadOp readOp = op.getMat().getDefiningOp(); + if (!readOp || !readOp.getOperation()->hasOneUse()) + return mlir::failure(); + rewriter.replaceOp(op, {rewriter.create(readOp.getLoc(), op.getType(), + readOp.getFileName())}); + // TODO Instead of erasing the ReadOp here, the compiler should + // generally remove unused SSA values. Then, we might even drop the + // hasOneUse requirement above. 
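    // Editorial sketch (not part of the patch): in effect, this pattern turns
    //   distribute(read("data.csv"))
    // into a single
    //   distributedRead("data.csv")
    // so the data is never fully materialized locally before being distributed.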
+ rewriter.eraseOp(readOp); + return mlir::success(); + } +}; + +void mlir::daphne::DistributeOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { + results.add(context); +} + +mlir::LogicalResult mlir::daphne::CondOp::canonicalize(mlir::daphne::CondOp op, mlir::PatternRewriter &rewriter) { + mlir::Value cond = op.getCond(); + if (llvm::isa(cond.getType())) + // If the condition is not a scalar, we cannot rewrite the operation + // here. + return mlir::failure(); + else { + // If the condition is a scalar, we rewrite the operation to an + // if-then-else construct using the SCF dialect. + // TODO Check if it is really a scalar. + + mlir::Location loc = op.getLoc(); + + // Ensure that the condition is a boolean. + if (!cond.getType().isSignlessInteger(1)) + cond = rewriter.create(loc, rewriter.getI1Type(), cond); + + mlir::Block thenBlock; + mlir::Block elseBlock; + mlir::Value thenVal = op.getThenVal(); + mlir::Value elseVal = op.getElseVal(); + + // Get rid of frame column labels, since they interfere with the type + // comparison (see #485). + if (auto thenFrmTy = thenVal.getType().dyn_cast()) + if (thenFrmTy.getLabels() != nullptr) + thenVal = rewriter.create(loc, thenFrmTy.withLabels(nullptr), thenVal); + if (auto elseFrmTy = elseVal.getType().dyn_cast()) + if (elseFrmTy.getLabels() != nullptr) + elseVal = rewriter.create(loc, elseFrmTy.withLabels(nullptr), elseVal); + + // Check if the types of the then-value and the else-value are the same. + if (thenVal.getType() != elseVal.getType()) { + if (llvm::isa(thenVal.getType()) || llvm::isa(elseVal.getType())) + // If one of them is unknown, we abort the rewrite (but this is + // not an error). The type may become known later, this rewrite + // will be triggered again. + return mlir::failure(); + else + // If both types are known, but different, this is an error. + // TODO We could try to cast the types. + throw ErrorHandler::compilerError(op, "CanonicalizerPass (mlir::daphne::CondOp)", + "the then/else-values of CondOp must have the same value " + "type"); + } + + { + // Save the insertion point (automatically restored at the end of + // the block). + PatternRewriter::InsertionGuard insertGuard(rewriter); + + // TODO The current implementation only makes sure that the correct + // value is returned, but the operations calculating the + // then/else-values are still outside the if-then-else and will + // always both be executed (unless, e.g., the entire branching can + // be elimitated). This could be good (e.g., if the then/else-values + // have common subexpressions with other code) or bad (e.g., if they + // are expensive to compute). See #486. + + // Create yield-operations in both branches. + rewriter.setInsertionPointToEnd(&thenBlock); + rewriter.create(loc, thenVal); + rewriter.setInsertionPointToEnd(&elseBlock); + rewriter.create(loc, elseVal); + } + + // Helper functions to move the operations in the two blocks created + // above into the actual branches of the if-operation. + auto insertThenBlockDo = [&](mlir::OpBuilder &nested, mlir::Location loc) { + nested.getBlock()->getOperations().splice(nested.getBlock()->end(), thenBlock.getOperations()); + }; + auto insertElseBlockDo = [&](mlir::OpBuilder &nested, mlir::Location loc) { + nested.getBlock()->getOperations().splice(nested.getBlock()->end(), elseBlock.getOperations()); + }; + + // Replace the daphne::CondOp by an scf::IfOp. 
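    // Editorial sketch of the IR produced by the replacement below (the printed
    // form is illustrative only):
    //   %res = scf.if %cond -> (T) {
    //     scf.yield %thenVal : T
    //   } else {
    //     scf.yield %elseVal : T
    //   }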
+ rewriter.replaceOpWithNewOp(op, cond, insertThenBlockDo, insertElseBlockDo); + + return mlir::success(); + } +} + +mlir::LogicalResult mlir::daphne::ConvertDenseMatrixToMemRef::canonicalize(mlir::daphne::ConvertDenseMatrixToMemRef op, + mlir::PatternRewriter &rewriter) { + // removes unnecessary conversions of MemRef -> DM -> MemRef + mlir::Operation *dmNode = op->getOperand(0).getDefiningOp(); + + if (!llvm::isa(dmNode)) + return failure(); + + mlir::Operation *originalMemRefOp = dmNode->getPrevNode()->getOperand(0).getDefiningOp(); + op.replaceAllUsesWith(originalMemRefOp); + + rewriter.eraseOp(op); + if (dmNode->getUsers().empty()) + rewriter.eraseOp(dmNode); + + return mlir::success(); +} + +mlir::LogicalResult mlir::daphne::ConvertMemRefToDenseMatrix::canonicalize(mlir::daphne::ConvertMemRefToDenseMatrix op, + mlir::PatternRewriter &rewriter) { + mlir::Operation *extractPtr = op->getPrevNode(); + auto srcMemRef = extractPtr->getOperand(0).getDefiningOp(); + extractPtr->moveAfter(srcMemRef); + op->moveAfter(extractPtr); + + return mlir::success(); +} + +mlir::LogicalResult mlir::daphne::RenameOp::canonicalize(mlir::daphne::RenameOp op, mlir::PatternRewriter &rewriter) { + // Replace the RenameOp by its argument, since we only need + // this operation during DaphneDSL parsing. + rewriter.replaceOp(op, op.getArg()); + return mlir::success(); +} + +/** + * @brief Replaces `--a` by `a` (`a` scalar). + * + * @param op + * @param rewriter + * @return + */ +mlir::LogicalResult mlir::daphne::EwMinusOp::canonicalize(mlir::daphne::EwMinusOp op, PatternRewriter &rewriter) { + if (auto innerOp = op.getOperand().getDefiningOp()) { + rewriter.replaceOp(op, innerOp.getOperand()); + return mlir::success(); + } + return mlir::failure(); +} diff --git a/src/ir/daphneir/Daphne.h b/src/ir/daphneir/Daphne.h index 73a2e6b23..f960c4e3c 100644 --- a/src/ir/daphneir/Daphne.h +++ b/src/ir/daphneir/Daphne.h @@ -20,11 +20,13 @@ // The following includes are required by... #include "llvm/ADT/StringRef.h" -// TODO Get rid of this workaround by removing the pragmas and the include within +// TODO Get rid of this workaround by removing the pragmas and the include +// within // (note that this header is also included transitively by FuncOps.h), // once the problem is fixed in MLIR/LLVM. // As of MLIR llvm/llvm-project@20d454c79bbca7822eee88d188afb7a8747dac58, -// AttrTypeSubElements.h yields the following warnings, which are hereby ignored: +// AttrTypeSubElements.h yields the following warnings, which are hereby +// ignored: // - "... parameter 'derived' set but not used [-Wunused-but-set-parameter]" // - "... parameter 'walkAttrsFn' set but not used [-Wunused-but-set-parameter]" // - "... 
parameter 'walkTypesFn' set but not used [-Wunused-but-set-parameter]" @@ -33,10 +35,8 @@ #include "mlir/IR/AttrTypeSubElements.h" #pragma GCC diagnostic pop -#include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Interfaces/ControlFlowInterfaces.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" @@ -44,16 +44,19 @@ #include "mlir/IR/Dialect.h" #include "mlir/IR/Location.h" #include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OpImplementation.h" #include "mlir/IR/Operation.h" #include "mlir/IR/OperationSupport.h" -#include "mlir/IR/OpImplementation.h" #include "mlir/IR/Types.h" +#include "mlir/Interfaces/ControlFlowInterfaces.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" // TODO Get rid of this workaround by removing the pragmas, // once the problem is fixed in MLIR/LLVM. // As of MLIR llvm/llvm-project@20d454c79bbca7822eee88d188afb7a8747dac58, // PatternMatch.h yields the following warning, which is hereby ignored: -// - "... typedef 'using FnTraitsT = struct llvm::function_traits' locally defined but not used [-Wunused-local-typedefs]" +// - "... typedef 'using FnTraitsT = struct llvm::function_traits' +// locally defined but not used [-Wunused-local-typedefs]" #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-local-typedefs" #include "mlir/IR/PatternMatch.h" @@ -62,12 +65,12 @@ #include "mlir/Support/TypeID.h" #include -#include #include #include -#include #include +#include #include +#include #include #include @@ -75,29 +78,27 @@ #include namespace mlir::OpTrait { - template - class FPGAOPENCLSupport : public TraitBase { - }; -} +template class FPGAOPENCLSupport : public TraitBase {}; +} // namespace mlir::OpTrait namespace mlir::daphne { - enum class MatrixRepresentation { - Dense = 0, - // default is dense - Default = MatrixRepresentation::Dense, - Sparse = 1, - }; +enum class MatrixRepresentation { + Dense = 0, + // default is dense + Default = MatrixRepresentation::Dense, + Sparse = 1, +}; - std::string matrixRepresentationToString(MatrixRepresentation rep); +std::string matrixRepresentationToString(MatrixRepresentation rep); - MatrixRepresentation stringToMatrixRepresentation(const std::string &str); -} +MatrixRepresentation stringToMatrixRepresentation(const std::string &str); +} // namespace mlir::daphne // ... the following tablegen'erated headers. 
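
A round-trip sketch for the representation helpers declared above (illustration only; the string forms "dense" and "sparse" match the implementations in DaphneDialect.cpp later in this diff):

    #include <cassert>

    // matrixRepresentationToString and stringToMatrixRepresentation are
    // inverses on the supported representations.
    void representationRoundTrip() {
        using mlir::daphne::MatrixRepresentation;
        assert(mlir::daphne::matrixRepresentationToString(MatrixRepresentation::Dense) == "dense");
        assert(mlir::daphne::stringToMatrixRepresentation("sparse") == MatrixRepresentation::Sparse);
    }
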
#define GET_TYPEDEF_CLASSES -#include #include "ir/daphneir/DaphneOpsDialect.h.inc" +#include #define GET_OP_CLASSES #include "ir/daphneir/DaphneOps.h.inc" -#endif //SRC_IR_DAPHNEIR_DAPHNE_H +#endif // SRC_IR_DAPHNEIR_DAPHNE_H diff --git a/src/ir/daphneir/DaphneAdaptTypesToKernelsTraits.h b/src/ir/daphneir/DaphneAdaptTypesToKernelsTraits.h index 6e30169ac..01db2d091 100644 --- a/src/ir/daphneir/DaphneAdaptTypesToKernelsTraits.h +++ b/src/ir/daphneir/DaphneAdaptTypesToKernelsTraits.h @@ -19,18 +19,17 @@ namespace mlir::OpTrait { -template -class CastArgsToResType : public TraitBase {}; +template class CastArgsToResType : public TraitBase {}; -template +template class CastFirstTwoArgsToResType : public TraitBase {}; -template +template class CastArgsToResTypeRandMatrixOp : public TraitBase {}; -template +template class CastArgsToMostGeneralArgType : public TraitBase {}; -} +} // namespace mlir::OpTrait -#endif //SRC_IR_DAPHNEIR_DAPHNEADAPTTYPESTOKERNELSTRAITS_H \ No newline at end of file +#endif // SRC_IR_DAPHNEIR_DAPHNEADAPTTYPESTOKERNELSTRAITS_H \ No newline at end of file diff --git a/src/ir/daphneir/DaphneDialect.cpp b/src/ir/daphneir/DaphneDialect.cpp index 542e66810..6c543d314 100644 --- a/src/ir/daphneir/DaphneDialect.cpp +++ b/src/ir/daphneir/DaphneDialect.cpp @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include @@ -31,7 +31,6 @@ #include #include -#include "llvm/ADT/ArrayRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" @@ -52,80 +51,69 @@ #include "mlir/Interfaces/VectorInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" #include "mlir/Transforms/InliningUtils.h" +#include "llvm/ADT/ArrayRef.h" -#include #include #include +#include #include #include #include struct DaphneInlinerInterface : public mlir::DialectInlinerInterface { - using DialectInlinerInterface::DialectInlinerInterface; + using DialectInlinerInterface::DialectInlinerInterface; - bool isLegalToInline(mlir::Operation *call, mlir::Operation *callable, - bool wouldBeCloned) const final { - return true; - } + bool isLegalToInline(mlir::Operation *call, mlir::Operation *callable, bool wouldBeCloned) const final { + return true; + } - bool isLegalToInline(mlir::Operation *, mlir::Region *, bool, mlir::IRMapping &) const final { - return true; - } + bool isLegalToInline(mlir::Operation *, mlir::Region *, bool, mlir::IRMapping &) const final { return true; } - bool isLegalToInline(mlir::Region *, mlir::Region *, bool, mlir::IRMapping &) const final { - return true; - } + bool isLegalToInline(mlir::Region *, mlir::Region *, bool, mlir::IRMapping &) const final { return true; } - void handleTerminator(mlir::Operation *op, - mlir::ArrayRef valuesToRepl) const final { - auto returnOp = mlir::dyn_cast(op); + void handleTerminator(mlir::Operation *op, mlir::ArrayRef valuesToRepl) const final { + auto returnOp = mlir::dyn_cast(op); - // Replace the values directly with the return operands. - if (returnOp.getNumOperands() != valuesToRepl.size()) { - throw ErrorHandler::compilerError(op, "DaphneInlinerInterface (handleTerminator)", - "number of operands " + std::to_string(returnOp.getNumOperands()) - + " from " + op->getName().getStringRef().str() - + " do not match size " + std::to_string(valuesToRepl.size()) - ); - } + // Replace the values directly with the return operands. 
+ if (returnOp.getNumOperands() != valuesToRepl.size()) { + throw ErrorHandler::compilerError(op, "DaphneInlinerInterface (handleTerminator)", + "number of operands " + std::to_string(returnOp.getNumOperands()) + + " from " + op->getName().getStringRef().str() + + " do not match size " + std::to_string(valuesToRepl.size())); + } - for (const auto &it : llvm::enumerate(returnOp.getOperands())) - valuesToRepl[it.index()].replaceAllUsesWith(it.value()); - } + for (const auto &it : llvm::enumerate(returnOp.getOperands())) + valuesToRepl[it.index()].replaceAllUsesWith(it.value()); + } - mlir::Operation *materializeCallConversion(mlir::OpBuilder &builder, mlir::Value input, - mlir::Type resultType, - mlir::Location conversionLoc) const final { - return builder.create(conversionLoc, resultType, input); - } + mlir::Operation *materializeCallConversion(mlir::OpBuilder &builder, mlir::Value input, mlir::Type resultType, + mlir::Location conversionLoc) const final { + return builder.create(conversionLoc, resultType, input); + } }; -void mlir::daphne::DaphneDialect::initialize() -{ +void mlir::daphne::DaphneDialect::initialize() { addOperations< - #define GET_OP_LIST - #include - >(); +#define GET_OP_LIST +#include + >(); addTypes< - #define GET_TYPEDEF_LIST - #include - >(); +#define GET_TYPEDEF_LIST +#include + >(); addInterfaces(); } -mlir::Operation *mlir::daphne::DaphneDialect::materializeConstant(OpBuilder &builder, - Attribute value, Type type, - mlir::Location loc) -{ +mlir::Operation *mlir::daphne::DaphneDialect::materializeConstant(OpBuilder &builder, Attribute value, Type type, + mlir::Location loc) { return builder.create(loc, type, value); } -mlir::Type mlir::daphne::DaphneDialect::parseType(mlir::DialectAsmParser &parser) const -{ +mlir::Type mlir::daphne::DaphneDialect::parseType(mlir::DialectAsmParser &parser) const { llvm::StringRef keyword; mlir::ParseResult pr = parser.parseKeyword(&keyword); - if(mlir::failed(pr)) + if (mlir::failed(pr)) throw std::runtime_error("parsing a DaphneIR type failed"); // `Matrix` `<` (`?` | \d+) `x` (`?` | \d+) `x` \type // (`:` ( @@ -156,12 +144,11 @@ mlir::Type mlir::daphne::DaphneDialect::parseType(mlir::DialectAsmParser &parser return nullptr; } } - if (parser.parseXInDimensionList() || - parser.parseType(elementType) - ) { + if (parser.parseXInDimensionList() || parser.parseType(elementType)) { return nullptr; } - // additional properties (only print/read them when present, as this will probably get more and more) + // additional properties (only print/read them when present, as this + // will probably get more and more) while (succeeded(parser.parseOptionalColon())) { if (succeeded(parser.parseOptionalKeyword("sp"))) { if (sparsity != -1.0) { @@ -171,42 +158,33 @@ mlir::Type mlir::daphne::DaphneDialect::parseType(mlir::DialectAsmParser &parser if (parser.parseLSquare() || parser.parseFloat(sparsity) || parser.parseRSquare()) { return nullptr; } - } - else if (succeeded(parser.parseOptionalKeyword("rep"))) { + } else if (succeeded(parser.parseOptionalKeyword("rep"))) { llvm::StringRef repName; if (parser.parseLSquare() || parser.parseKeyword(&repName) || parser.parseRSquare()) { return nullptr; } representation = stringToMatrixRepresentation(repName.str()); - } - else { + } else { return nullptr; } } - if(parser.parseGreater()) { + if (parser.parseGreater()) { return nullptr; } - return MatrixType::get( - parser.getBuilder().getContext(), elementType, numRows, numCols, sparsity, representation - ); - } - else if (keyword == "Frame") { + return 
MatrixType::get(parser.getBuilder().getContext(), elementType, numRows, numCols, sparsity, + representation); + } else if (keyword == "Frame") { ssize_t numRows = -1; ssize_t numCols = -1; - if ( - parser.parseLess() || - parser.parseOptionalQuestion() || + if (parser.parseLess() || parser.parseOptionalQuestion() || // TODO Parse #rows if there was no '?'. - //parser.parseInteger(numRows) || - parser.parseKeyword("x") || - parser.parseLSquare() || - parser.parseOptionalQuestion() || + // parser.parseInteger(numRows) || + parser.parseKeyword("x") || parser.parseLSquare() || parser.parseOptionalQuestion() || // TODO Parse #cols if there was no '?'. - //parser.parseInteger(numCols) || + // parser.parseInteger(numCols) || // TODO Parse sparsity - parser.parseColon() - ) { + parser.parseColon()) { return nullptr; } std::vector cts; @@ -215,52 +193,37 @@ mlir::Type mlir::daphne::DaphneDialect::parseType(mlir::DialectAsmParser &parser if (parser.parseType(type)) return nullptr; cts.push_back(type); - } - while (succeeded(parser.parseOptionalComma())); + } while (succeeded(parser.parseOptionalComma())); if (parser.parseRSquare() || parser.parseGreater()) { return nullptr; } - return FrameType::get( - parser.getBuilder().getContext(), cts, numRows, numCols, nullptr - ); - } - else if (keyword == "Handle") { + return FrameType::get(parser.getBuilder().getContext(), cts, numRows, numCols, nullptr); + } else if (keyword == "Handle") { mlir::Type dataType; if (parser.parseLess() || parser.parseType(dataType) || parser.parseGreater()) { return nullptr; } return mlir::daphne::HandleType::get(parser.getBuilder().getContext(), dataType); - } - else if (keyword == "String") { + } else if (keyword == "String") { return StringType::get(parser.getBuilder().getContext()); - } - else if (keyword == "DaphneContext") { + } else if (keyword == "DaphneContext") { return mlir::daphne::DaphneContextType::get(parser.getBuilder().getContext()); - } - else { + } else { parser.emitError(parser.getCurrentLocation()) << "Parsing failed, keyword `" << keyword << "` not recognized!"; return nullptr; } } -std::string unknownStrIf(ssize_t val) { - return (val == -1) ? "?" : std::to_string(val); -} +std::string unknownStrIf(ssize_t val) { return (val == -1) ? "?" : std::to_string(val); } -std::string unknownStrIf(double val) { - return (val == -1.0) ? "?" : std::to_string(val); -} +std::string unknownStrIf(double val) { return (val == -1.0) ? "?" : std::to_string(val); } -void mlir::daphne::DaphneDialect::printType(mlir::Type type, - mlir::DialectAsmPrinter &os) const -{ +void mlir::daphne::DaphneDialect::printType(mlir::Type type, mlir::DialectAsmPrinter &os) const { if (type.isa()) os << "Structure"; else if (auto t = type.dyn_cast()) { - os << "Matrix<" - << unknownStrIf(t.getNumRows()) << 'x' - << unknownStrIf(t.getNumCols()) << 'x' - << t.getElementType(); + os << "Matrix<" << unknownStrIf(t.getNumRows()) << 'x' << unknownStrIf(t.getNumCols()) << 'x' + << t.getElementType(); auto sparsity = t.getSparsity(); auto representation = t.getRepresentation(); @@ -271,41 +234,34 @@ void mlir::daphne::DaphneDialect::printType(mlir::Type type, os << ":rep[" << matrixRepresentationToString(representation) << ']'; } os << '>'; - } - else if (auto t = type.dyn_cast()) { - os << "Frame<" - << unknownStrIf(t.getNumRows()) << "x[" - << unknownStrIf(t.getNumCols()) << ": "; + } else if (auto t = type.dyn_cast()) { + os << "Frame<" << unknownStrIf(t.getNumRows()) << "x[" << unknownStrIf(t.getNumCols()) << ": "; // Column types. 
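    // Editorial examples of the textual forms printed here (sketch; optional
    // suffixes such as :rep[...] appear only when the property is known):
    //   Matrix<10x10xf64:rep[sparse]>
    //   Frame<?x[2: f64, si64], ["a", "b"]>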
std::vector cts = t.getColumnTypes(); for (size_t i = 0; i < cts.size(); i++) { os << cts[i]; - if(i < cts.size() - 1) + if (i < cts.size() - 1) os << ", "; } os << "], "; // Column labels. - std::vector * labels = t.getLabels(); - if(labels) { + std::vector *labels = t.getLabels(); + if (labels) { os << '['; for (size_t i = 0; i < labels->size(); i++) { os << '"' << (*labels)[i] << '"'; - if(i < labels->size() - 1) + if (i < labels->size() - 1) os << ", "; } os << ']'; - } - else + } else os << '?'; os << '>'; - } - else if (auto t = type.dyn_cast()) { + } else if (auto t = type.dyn_cast()) { os << "List<" << t.getElementType() << '>'; - } - else if (auto handle = type.dyn_cast()) { + } else if (auto handle = type.dyn_cast()) { os << "Handle<" << handle.getDataType() << ">"; - } - else if (isa(type)) + } else if (isa(type)) os << "String"; else if (auto t = type.dyn_cast()) os << "VariadicPack<" << t.getContainedType() << '>'; @@ -328,13 +284,12 @@ std::string mlir::daphne::matrixRepresentationToString(MatrixRepresentation rep) case MatrixRepresentation::Sparse: return "sparse"; default: - throw std::runtime_error("unknown mlir::daphne::MatrixRepresentation " + - std::to_string(static_cast(rep))); + throw std::runtime_error("unknown mlir::daphne::MatrixRepresentation " + std::to_string(static_cast(rep))); } } mlir::daphne::MatrixRepresentation mlir::daphne::stringToMatrixRepresentation(const std::string &str) { - if(str == "dense") + if (str == "dense") return MatrixRepresentation::Dense; else if (str == "sparse") return MatrixRepresentation::Sparse; @@ -343,1219 +298,110 @@ mlir::daphne::MatrixRepresentation mlir::daphne::stringToMatrixRepresentation(co } namespace mlir::daphne { - namespace detail { - struct MatrixTypeStorage : public ::mlir::TypeStorage { - // TODO: adapt epsilon for equality check (I think the only use is saving memory for the MLIR-IR representation of this type) - // the choosen epsilon directly defines how accurate our sparsity inference can be - constexpr static const double epsilon = 1e-6; - MatrixTypeStorage(::mlir::Type elementType, - ssize_t numRows, - ssize_t numCols, - double sparsity, - MatrixRepresentation representation) - : elementType(elementType), numRows(numRows), numCols(numCols), sparsity(sparsity), - representation(representation) {} - - /// The hash key is a tuple of the parameter types. - using KeyTy = std::tuple<::mlir::Type, ssize_t, ssize_t, double, MatrixRepresentation>; - bool operator==(const KeyTy &tblgenKey) const { - if(!(elementType == std::get<0>(tblgenKey))) - return false; - if(numRows != std::get<1>(tblgenKey)) - return false; - if(numCols != std::get<2>(tblgenKey)) - return false; - if(std::fabs(sparsity - std::get<3>(tblgenKey)) >= epsilon) - return false; - if(representation != std::get<4>(tblgenKey)) - return false; - return true; - } - static ::llvm::hash_code hashKey(const KeyTy &tblgenKey) { - auto float_hashable = static_cast(std::get<3>(tblgenKey) / epsilon); - return ::llvm::hash_combine(std::get<0>(tblgenKey), - std::get<1>(tblgenKey), - std::get<2>(tblgenKey), - float_hashable, - std::get<4>(tblgenKey)); - } - - /// Define a construction method for creating a new instance of this - /// storage. 
- static MatrixTypeStorage *construct(::mlir::TypeStorageAllocator &allocator, - const KeyTy &tblgenKey) { - auto elementType = std::get<0>(tblgenKey); - auto numRows = std::get<1>(tblgenKey); - auto numCols = std::get<2>(tblgenKey); - auto sparsity = std::get<3>(tblgenKey); - auto representation = std::get<4>(tblgenKey); - - return new(allocator.allocate()) - MatrixTypeStorage(elementType, numRows, numCols, sparsity, representation); - } - ::mlir::Type elementType; - ssize_t numRows; - ssize_t numCols; - double sparsity; - MatrixRepresentation representation; - }; - } - ::mlir::Type MatrixType::getElementType() const { return getImpl()->elementType; } - ssize_t MatrixType::getNumRows() const { return getImpl()->numRows; } - ssize_t MatrixType::getNumCols() const { return getImpl()->numCols; } - double MatrixType::getSparsity() const { return getImpl()->sparsity; } - MatrixRepresentation MatrixType::getRepresentation() const { return getImpl()->representation; } -} - -mlir::OpFoldResult mlir::daphne::ConstantOp::fold(FoldAdaptor adaptor) -{ - if (!adaptor.getOperands().empty()) - throw ErrorHandler::compilerError( - this->getLoc(), "CanonicalizerPass (mlir::daphne::ConstantOp::fold)", - "constant has no operands but " + std::to_string(adaptor.getOperands().size()) + " were given"); - - return getValue(); -} - -::mlir::LogicalResult mlir::daphne::MatrixType::verify( - ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, - Type elementType, - ssize_t numRows, ssize_t numCols, double sparsity, MatrixRepresentation rep -) -{ - if ( - ( +namespace detail { +struct MatrixTypeStorage : public ::mlir::TypeStorage { + // TODO: adapt epsilon for equality check (I think the only use is saving + // memory for the MLIR-IR representation of this type) + // the choosen epsilon directly defines how accurate our sparsity inference + // can be + constexpr static const double epsilon = 1e-6; + MatrixTypeStorage(::mlir::Type elementType, ssize_t numRows, ssize_t numCols, double sparsity, + MatrixRepresentation representation) + : elementType(elementType), numRows(numRows), numCols(numCols), sparsity(sparsity), + representation(representation) {} + + /// The hash key is a tuple of the parameter types. + using KeyTy = std::tuple<::mlir::Type, ssize_t, ssize_t, double, MatrixRepresentation>; + bool operator==(const KeyTy &tblgenKey) const { + if (!(elementType == std::get<0>(tblgenKey))) + return false; + if (numRows != std::get<1>(tblgenKey)) + return false; + if (numCols != std::get<2>(tblgenKey)) + return false; + if (std::fabs(sparsity - std::get<3>(tblgenKey)) >= epsilon) + return false; + if (representation != std::get<4>(tblgenKey)) + return false; + return true; + } + static ::llvm::hash_code hashKey(const KeyTy &tblgenKey) { + auto float_hashable = static_cast(std::get<3>(tblgenKey) / epsilon); + return ::llvm::hash_combine(std::get<0>(tblgenKey), std::get<1>(tblgenKey), std::get<2>(tblgenKey), + float_hashable, std::get<4>(tblgenKey)); + } + + /// Define a construction method for creating a new instance of this + /// storage. 
+ static MatrixTypeStorage *construct(::mlir::TypeStorageAllocator &allocator, const KeyTy &tblgenKey) { + auto elementType = std::get<0>(tblgenKey); + auto numRows = std::get<1>(tblgenKey); + auto numCols = std::get<2>(tblgenKey); + auto sparsity = std::get<3>(tblgenKey); + auto representation = std::get<4>(tblgenKey); + + return new (allocator.allocate<MatrixTypeStorage>()) + MatrixTypeStorage(elementType, numRows, numCols, sparsity, representation); + } + ::mlir::Type elementType; + ssize_t numRows; + ssize_t numCols; + double sparsity; + MatrixRepresentation representation; +}; +} // namespace detail +::mlir::Type MatrixType::getElementType() const { return getImpl()->elementType; } +ssize_t MatrixType::getNumRows() const { return getImpl()->numRows; } +ssize_t MatrixType::getNumCols() const { return getImpl()->numCols; } +double MatrixType::getSparsity() const { return getImpl()->sparsity; } +MatrixRepresentation MatrixType::getRepresentation() const { return getImpl()->representation; } +} // namespace mlir::daphne + +::mlir::LogicalResult mlir::daphne::MatrixType::verify(::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, + Type elementType, ssize_t numRows, ssize_t numCols, + double sparsity, MatrixRepresentation rep) { + if (( // Value type is unknown. llvm::isa<mlir::daphne::UnknownType>(elementType) // Value type is known. - || elementType.isSignedInteger(64) - || elementType.isUnsignedInteger(8) - || elementType.isUnsignedInteger(64) - || elementType.isF32() - || elementType.isF64() - || elementType.isIndex() - || elementType.isInteger(1) - || llvm::isa<mlir::daphne::StringType>(elementType) - || elementType.isUnsignedInteger(64) - || elementType.isUnsignedInteger(32) - || elementType.isSignedInteger(32) - || elementType.isSignedInteger(8) - ) && ( + || elementType.isSignedInteger(64) || elementType.isUnsignedInteger(8) || + elementType.isUnsignedInteger(64) || elementType.isF32() || elementType.isF64() || elementType.isIndex() || + elementType.isInteger(1) || llvm::isa<mlir::daphne::StringType>(elementType) || + elementType.isUnsignedInteger(64) || elementType.isUnsignedInteger(32) || elementType.isSignedInteger(32) || + elementType.isSignedInteger(8)) && + ( // Number of rows and columns are valid (-1 for unknown). - numRows >= -1 && numCols >= -1 - ) && ( - sparsity == -1 || (sparsity >= 0.0 && sparsity <= 1.0) - ) - ) + numRows >= -1 && numCols >= -1) && + (sparsity == -1 || (sparsity >= 0.0 && sparsity <= 1.0))) return mlir::success(); else return emitError() << "invalid matrix element type: " << elementType; } -::mlir::LogicalResult mlir::daphne::FrameType::verify( - ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, - std::vector<Type> columnTypes, - ssize_t numRows, ssize_t numCols, - std::vector<std::string> * labels -) -{ +::mlir::LogicalResult mlir::daphne::FrameType::verify(::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, + std::vector<Type> columnTypes, ssize_t numRows, ssize_t numCols, + std::vector<std::string> *labels) { // TODO Verify the individual column types.
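// Aside (illustration, not part of the patch): what the checks in
// MatrixType::verify above and FrameType::verify below accept, with -1
// encoding "unknown" throughout:
//     numRows=10, numCols=-1, sparsity=0.3 -> valid matrix (unknown #cols)
//     numRows=10, numCols=5, sparsity=1.5  -> invalid (sparsity outside [0, 1])
//     numCols=3 but columnTypes.size()==2  -> invalid frame (size mismatch)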
- if(numRows < -1 || numCols < -1) + if (numRows < -1 || numCols < -1) return mlir::failure(); - if(numCols != -1) { + if (numCols != -1) { // ToDo: ExtractColOp does not provide these columnTypes - if(!columnTypes.empty()) { + if (!columnTypes.empty()) { if (static_cast(columnTypes.size()) != numCols) return mlir::failure(); if (labels && static_cast(labels->size()) != numCols) return mlir::failure(); } } - if(labels && labels->size() != columnTypes.size()) + if (labels && labels->size() != columnTypes.size()) return mlir::failure(); return mlir::success(); } ::mlir::LogicalResult mlir::daphne::HandleType::verify(::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, - Type dataType) -{ + Type dataType) { if (llvm::isa(dataType)) { return mlir::success(); - } - else + } else return emitError() << "only matrix type is supported for handle atm, got: " << dataType; } - -mlir::LogicalResult mlir::daphne::VectorizedPipelineOp::canonicalize(mlir::daphne::VectorizedPipelineOp op, - mlir::PatternRewriter &rewriter) -{ - // // Find duplicate inputs - std::vector vSplitsAttrs; - for (auto & split : op.getSplits()) - vSplitsAttrs.push_back(split); - auto currentSize = op.getInputs().size(); - - DenseMap inputMap; - - for (size_t i = 0; i < currentSize; i++) { - const auto& input = op.getInputs()[i]; - const auto& split = op.getSplits()[i].cast().getValue(); - - if (inputMap.count(input) == 0) { - inputMap[input] = i; - } else { - size_t j = inputMap[input]; - if (op.getSplits()[j].cast().getValue() == split) { - op.getBody().getArgument(i).replaceAllUsesWith(op.getBody().getArgument(j)); - op.getBody().eraseArgument(i); - op.getInputsMutable().erase(i); - vSplitsAttrs.erase(vSplitsAttrs.begin() + i); - currentSize--; - i--; - } - } - } - - std::vector resultsToReplace; - std::vector outRows; - std::vector outCols; - std::vector vCombineAttrs; - - llvm::BitVector eraseIxs; - eraseIxs.resize(op.getNumResults()); - for(auto result : op.getResults()) { - auto resultIx = result.getResultNumber(); - if(result.use_empty()) { - // remove - eraseIxs.set(resultIx); - } - else { - resultsToReplace.push_back(result); - outRows.push_back(op.getOutRows()[resultIx]); - outCols.push_back(op.getOutCols()[resultIx]); - vCombineAttrs.push_back(op.getCombines()[resultIx]); - } - } - op.getBody().front().getTerminator()->eraseOperands(eraseIxs); - if(!op.getCuda().getBlocks().empty()) - op.getCuda().front().getTerminator()->eraseOperands(eraseIxs); - - if(resultsToReplace.size() == op->getNumResults() && op.getSplits().size() == vSplitsAttrs.size()) { - return failure(); - } - auto pipelineOp = rewriter.create(op.getLoc(), - ValueRange(resultsToReplace).getTypes(), - op.getInputs(), - outRows, - outCols, - rewriter.getArrayAttr(vSplitsAttrs), - rewriter.getArrayAttr(vCombineAttrs), - op.getCtx()); - pipelineOp.getBody().takeBody(op.getBody()); - if(!op.getCuda().getBlocks().empty()) - pipelineOp.getCuda().takeBody(op.getCuda()); - for (auto e : llvm::enumerate(resultsToReplace)) { - auto resultToReplace = e.value(); - auto i = e.index(); - resultToReplace.replaceAllUsesWith(pipelineOp.getResult(i)); - } - op.erase(); - return success(); -} - -// **************************************************************************** -// Fold utility functions/macros -// **************************************************************************** -// For families of operations. 
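// Aside (illustration, not part of the patch): the fold utilities below all
// follow one pattern -- bail out unless every operand is a compile-time
// constant, then apply a small lambda to the constant values. A simplified,
// MLIR-free sketch of that pattern (all names here are hypothetical):
#include <functional>
#include <optional>
template <class T>
std::optional<T> constFoldBinary(const std::optional<T> &lhs, const std::optional<T> &rhs,
                                 const std::function<T(T, T)> &calculate) {
    if (!lhs || !rhs) // at least one operand is not a constant -> nothing to fold
        return std::nullopt;
    return calculate(*lhs, *rhs);
}
// Usage: constFoldBinary<int>({2}, {3}, [](int a, int b) { return a + b; }) yields 5.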
- -// Adapted from "mlir/Dialect/CommonFolders.h" -mlir::Attribute performCast(mlir::Attribute attr, mlir::Type targetType, mlir::Location loc); - -template< - class ArgAttrElementT, - class ResAttrElementT = ArgAttrElementT, - class ArgElementValueT = typename ArgAttrElementT::ValueType, - class ResElementValueT = typename ResAttrElementT::ValueType, - class CalculationT = std::function -> -mlir::Attribute constFoldBinaryOp(mlir::Location loc, mlir::Type resultType, llvm::ArrayRef operands, - const CalculationT &calculate) { - if (operands.size() != 2) - throw ErrorHandler::compilerError(loc, - "CanonicalizerPass (constFoldBinaryOp)", - "binary op takes two operands but " + std::to_string(operands.size()) + " were given"); - - if(!operands[0] || !operands[1]) - return {}; - - if(llvm::isa(operands[0]) && llvm::isa(operands[1])) { - auto lhs = operands[0].cast(); - auto rhs = operands[1].cast(); - - // We need dedicated cases, as the parameters of ResAttrElementT::get() depend on ResAttrElementT. - if constexpr( - std::is_same::value || - std::is_same::value - ) { - mlir::Type l = lhs.getType(); - mlir::Type r = rhs.getType(); - if ((l.dyn_cast() || l.dyn_cast()) && - (r.dyn_cast() || r.dyn_cast())) { - auto lhsBitWidth = lhs.getType().getIntOrFloatBitWidth(); - auto rhsBitWidth = rhs.getType().getIntOrFloatBitWidth(); - - if (lhsBitWidth < rhsBitWidth) { - mlir::Attribute promotedLhs = performCast(lhs, rhs.getType(), loc); - lhs = promotedLhs.cast(); - } else if (rhsBitWidth < lhsBitWidth) { - mlir::Attribute promotedRhs = performCast(rhs, lhs.getType(), loc); - rhs = promotedRhs.cast(); - } - } - return ResAttrElementT::get(resultType, calculate(lhs.getValue(), rhs.getValue())); - } - else if constexpr(std::is_same::value) { - if(!resultType.isSignlessInteger(1)) - throw ErrorHandler::compilerError( - loc, "CanonicalizerPass (constFoldBinaryOp)", "expected boolean result type" - ); - return ResAttrElementT::get(lhs.getContext(), calculate(lhs.getValue(), rhs.getValue())); - } - else if constexpr(std::is_same::value) { - if(!resultType.isa()) - throw ErrorHandler::compilerError( - loc, "CanonicalizerPass (constFoldBinaryOp)", "expected string result type" - ); - return ResAttrElementT::get(calculate(lhs.getValue(), rhs.getValue()), resultType); - } - } - return {}; -} -template> -mlir::Attribute constFoldUnaryOp(mlir::Location loc, mlir::Type resultType, llvm::ArrayRef operands, - const CalculationT &calculate) { - if (operands.size() != 1) - throw ErrorHandler::compilerError(loc, - "CanonicalizerPass (constFoldUnaryOp)", - "unary op takes one operand but " + std::to_string(operands.size()) + " were given"); - - if (!operands[0]) - return {}; - - if (llvm::isa(operands[0])) { - auto operand = operands[0].cast(); - - return AttrElementT::get(resultType, calculate(operand.getValue())); - } - return {}; -} - -// **************************************************************************** -// Fold implementations -// **************************************************************************** -mlir::Attribute performCast(mlir::Attribute attr, mlir::Type targetType, mlir::Location loc) { - if (auto intAttr = attr.dyn_cast()) { - auto apInt = intAttr.getValue(); - - if (auto outTy = targetType.dyn_cast()) { - // Extend or truncate the integer value based on the target type - if (outTy.isUnsignedInteger()) { - apInt = apInt.zextOrTrunc(outTy.getWidth()); - } else if (outTy.isSignedInteger()) { - apInt = (intAttr.getType().isSignedInteger()) - ? 
apInt.sextOrTrunc(outTy.getWidth()) - : apInt.zextOrTrunc(outTy.getWidth()); - } - return mlir::IntegerAttr::getChecked(loc, outTy, apInt); - } - - if (auto outTy = targetType.dyn_cast()) { - return mlir::IntegerAttr::getChecked(loc, outTy, apInt); - } - - if (targetType.isF64()) { - if (intAttr.getType().isSignedInteger()) { - return mlir::FloatAttr::getChecked(loc, targetType, - llvm::APIntOps::RoundSignedAPIntToDouble(apInt)); - } - if (intAttr.getType().isUnsignedInteger() || intAttr.getType().isIndex()) { - return mlir::FloatAttr::getChecked(loc, targetType, - llvm::APIntOps::RoundAPIntToDouble(apInt)); - } - } - - if (targetType.isF32()) { - if (intAttr.getType().isSignedInteger()) { - return mlir::FloatAttr::getChecked(loc, targetType, - llvm::APIntOps::RoundSignedAPIntToFloat(apInt)); - } - if (intAttr.getType().isUnsignedInteger()) { - return mlir::FloatAttr::get(targetType, - llvm::APIntOps::RoundAPIntToFloat(apInt)); - } - } - } - else if (auto floatAttr = attr.dyn_cast()) { - auto val = floatAttr.getValueAsDouble(); - - if (targetType.isF64()) { - return mlir::FloatAttr::getChecked(loc, targetType, val); - } - if (targetType.isF32()) { - return mlir::FloatAttr::getChecked(loc, targetType, static_cast(val)); - } - if (targetType.isIntOrIndex()) { - auto num = static_cast(val); - return mlir::IntegerAttr::getChecked(loc, targetType, num); - } - } - - // If casting is not possible, return the original attribute - return {}; -} - -mlir::OpFoldResult mlir::daphne::CastOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - - if (isTrivialCast()) { - if (operands[0]) - return {operands[0]}; - else - return {getArg()}; - } - - if (operands[0]) { - if (auto castedAttr = performCast(operands[0], getType(), getLoc())) { - return castedAttr; - } - } - - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwAddOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { return a + b; }; - // TODO: we could check overflows - auto intOp = [](const llvm::APInt &a, const llvm::APInt &b) { return a + b; }; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, intOp)) - return res; - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwSubOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { return a - b; }; - auto intOp = [](const llvm::APInt &a, const llvm::APInt &b) { return a - b; }; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, intOp)) - return res; - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwMulOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { return a * b; }; - auto intOp = [](const llvm::APInt &a, const llvm::APInt &b) { return a * b; }; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, intOp)) - return res; - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwDivOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { return a / b; }; - auto sintOp = [&](const llvm::APInt &a, const 
llvm::APInt &b) { - if(b == 0) { - throw ErrorHandler::compilerError( - this->getLoc(), "CanonicalizerPass (mlir::daphne::EwDivOp::fold)", - "Can't divide by 0"); - } - return a.sdiv(b); - }; - auto uintOp = [&](const llvm::APInt &a, const llvm::APInt &b) { - if(b == 0) { - throw ErrorHandler::compilerError( - this->getLoc(), "CanonicalizerPass (mlir::daphne::EwDivOp::fold)", - "Can't divide by 0"); - } - return a.udiv(b); - }; - - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - if(getType().isSignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, sintOp)) - return res; - } - else if(getType().isUnsignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, uintOp)) - return res; - } - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwMinusOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto intOp = [](const llvm::APInt &a) { return -a; }; - auto floatOp = [](const llvm::APFloat &a) { return -a; }; - - if (auto res = constFoldUnaryOp(getLoc(), getType(), operands, intOp)) - return res; - if (auto res = constFoldUnaryOp(getLoc(), getType(), operands, floatOp)) - return res; - - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwPowOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - // TODO: EwPowOp integer constant folding - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { - return std::pow(a.convertToDouble(), b.convertToDouble()); - }; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwModOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto sintOp = [&](const llvm::APInt &a, const llvm::APInt &b) { - if(b == 0) { - throw ErrorHandler::compilerError( - this->getLoc(), "CanonicalizerPass (mlir::daphne::EwModOp::fold)", - "Can't compute mod 0"); - } - return a.srem(b); - }; - auto uintOp = [&](const llvm::APInt &a, const llvm::APInt &b) { - if(b == 0) { - throw ErrorHandler::compilerError( - this->getLoc(), "CanonicalizerPass (mlir::daphne::EwModOp::fold)", - "Can't compute mod 0"); - } - return a.urem(b); - }; - if(getType().isSignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, sintOp)) - return res; - } - else if(getType().isUnsignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, uintOp)) - return res; - } - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwLogOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { - // Compute the element-wise logarithm of a to the base b - // Equivalent to log_b(a) - return log(a.convertToDouble()) / log(b.convertToDouble()); - }; - if (auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwMinOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { return llvm::minimum(a, b); }; - auto sintOp = [&](const llvm::APInt &a, const llvm::APInt &b) { - if(a.slt(b)) - return a; - else - return b; - }; - auto uintOp = [&](const llvm::APInt &a, const llvm::APInt &b) { - if(a.ult(b)) - return a; - else - return b; - }; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - if(getType().isSignedInteger()) { - if(auto 
res = constFoldBinaryOp(getLoc(), getType(), operands, sintOp)) - return res; - } - else if(getType().isUnsignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, uintOp)) - return res; - } - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwMaxOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { return llvm::maximum(a, b); }; - auto sintOp = [&](const llvm::APInt &a, const llvm::APInt &b) { - if(a.sgt(b)) - return a; - else - return b; - }; - auto uintOp = [&](const llvm::APInt &a, const llvm::APInt &b) { - if(a.ugt(b)) - return a; - else - return b; - }; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - if(getType().isSignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, sintOp)) - return res; - } - else if(getType().isUnsignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, uintOp)) - return res; - } - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwAndOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto boolOp = [](const bool &a, const bool &b) { return a && b; }; - auto intOp = [](const llvm::APInt &a, const llvm::APInt &b) { return (a != 0) && (b != 0); }; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, boolOp)) - return res; - // TODO: should output bool? - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, intOp)) - return res; - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwBitwiseAndOp::fold(FoldAdaptor adaptor) { - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwOrOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto boolOp = [](const bool &a, const bool &b) { return a || b; }; - auto intOp = [](const llvm::APInt &a, const llvm::APInt &b) { return (a != 0) || (b != 0); }; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, boolOp)) - return res; - // TODO: should output bool - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, intOp)) - return res; - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwXorOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto boolOp = [](const bool &a, const bool &b) { return a ^ b; }; - auto intOp = [](const llvm::APInt &a, const llvm::APInt &b) { return (a != 0) ^ (b != 0); }; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, boolOp)) - return res; - // TODO: should output bool - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, intOp)) - return res; - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwConcatOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - - if (operands.size() != 2) - throw ErrorHandler::compilerError( - this->getLoc(), "CanonicalizerPass (mlir::daphne::EwConcatOp::fold)", - "binary op takes two operands but " + std::to_string(operands.size()) + " were given"); - - if(!operands[0] || !operands[1]) - return {}; - - if(llvm::isa(operands[0]) && isa(operands[1])) { - auto lhs = operands[0].cast(); - auto rhs = operands[1].cast(); - - auto concated = lhs.getValue().str() + rhs.getValue().str(); - return StringAttr::get(concated, getType()); - } - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwEqOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { return a == b; }; - auto intOp = [](const llvm::APInt 
&a, const llvm::APInt &b) { return a == b; }; - auto strOp = [](const llvm::StringRef &a, const llvm::StringRef &b) { return a == b; }; - // TODO: fix bool return - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, intOp)) - return res; - if(auto res = constFoldBinaryOp(getLoc(), IntegerType::get(getContext(), 64, IntegerType::SignednessSemantics::Signed), operands, strOp)) - return res; - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwNeqOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { return a != b; }; - auto intOp = [](const llvm::APInt &a, const llvm::APInt &b) { return a != b; }; - // TODO: fix bool return - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, intOp)) - return res; - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwLtOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { return a < b; }; - auto sintOp = [](const llvm::APInt &a, const llvm::APInt &b) { return a.slt(b); }; - auto uintOp = [](const llvm::APInt &a, const llvm::APInt &b) { return a.ult(b); }; - // TODO: fix bool return - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - if(getType().isSignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, sintOp)) - return res; - } - else if(getType().isUnsignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, uintOp)) - return res; - } - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwLeOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { return a <= b; }; - auto sintOp = [](const llvm::APInt &a, const llvm::APInt &b) { return a.sle(b); }; - auto uintOp = [](const llvm::APInt &a, const llvm::APInt &b) { return a.ule(b); }; - // TODO: fix bool return - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - if(getType().isSignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, sintOp)) - return res; - } - else if(getType().isUnsignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, uintOp)) - return res; - } - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwGtOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { return a > b; }; - auto sintOp = [](const llvm::APInt &a, const llvm::APInt &b) { return a.sgt(b); }; - auto uintOp = [](const llvm::APInt &a, const llvm::APInt &b) { return a.ugt(b); }; - // TODO: fix bool return - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - if(getType().isSignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, sintOp)) - return res; - } - else if(getType().isUnsignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, uintOp)) - return res; - } - return {}; -} - -mlir::OpFoldResult mlir::daphne::EwGeOp::fold(FoldAdaptor adaptor) { - ArrayRef operands = adaptor.getOperands(); - auto floatOp = [](const llvm::APFloat &a, const llvm::APFloat &b) { return a >= b; }; - auto sintOp = [](const 
llvm::APInt &a, const llvm::APInt &b) { return a.sge(b); }; - auto uintOp = [](const llvm::APInt &a, const llvm::APInt &b) { return a.uge(b); }; - // TODO: fix bool return - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, floatOp)) - return res; - if(getType().isSignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, sintOp)) - return res; - } - else if(getType().isUnsignedInteger()) { - if(auto res = constFoldBinaryOp(getLoc(), getType(), operands, uintOp)) - return res; - } - return {}; -} - -/** - * @brief Transposition-aware matrix multiplication - * Identifies if an input to a MatMulOp is the result of a TransposeOp; Rewrites the Operation, - * passing transposition info as a flag, instead of transposing the matrix before multiplication - */ -mlir::LogicalResult mlir::daphne::MatMulOp::canonicalize( - mlir::daphne::MatMulOp op, PatternRewriter &rewriter -) { - mlir::Value lhs = op.getLhs(); - mlir::Value rhs = op.getRhs(); - mlir::Value transa = op.getTransa(); - mlir::Value transb = op.getTransb(); - - // TODO If transa or transb are not constant, we cannot continue on the respective side; - // we cannot just assume false then. - bool ta = CompilerUtils::constantOrDefault(transa, false); - bool tb = CompilerUtils::constantOrDefault(transb, false); - - // TODO Turn on the transposition-awareness for the left-hand-side argument again (see #447). - // mlir::daphne::TransposeOp lhsTransposeOp = lhs.getDefiningOp(); - mlir::daphne::TransposeOp rhsTransposeOp = rhs.getDefiningOp(); - - //if (!lhsTransposeOp && !rhsTransposeOp){ - if (!rhsTransposeOp){ - return mlir::failure(); - } - - // ToDo: This check prevents merging transpose into matrix multiplication because that is not yet supported by our - // sparse kernels. - // ToDo: bring user config here for sparsity threshold or properly use MatrixRepresentation - if(auto t = rhs.getType().dyn_cast()) { - auto sparsity = t.getSparsity(); - if(sparsity < 0.25) - return mlir::failure(); - } - -#if 0 - // TODO Adapt PhyOperatorSelectionPass once this code is turned on again. - if(lhsTransposeOp) { - lhs = lhsTransposeOp.getArg(); - ta = !ta; - } -#endif - if(rhsTransposeOp) { - rhs = rhsTransposeOp.getArg(); - tb = !tb; - } - - rewriter.replaceOpWithNewOp( - op, op.getType(), lhs, rhs, - static_cast(rewriter.create(transa.getLoc(), ta)), - static_cast(rewriter.create(transb.getLoc(), tb)) - ); - return mlir::success(); -} - -/** - * @brief Replaces NumRowsOp by a constant, if the #rows of the input is known - * (e.g., due to shape inference). - */ -mlir::LogicalResult mlir::daphne::NumRowsOp::canonicalize( - mlir::daphne::NumRowsOp op, PatternRewriter &rewriter -) { - ssize_t numRows = -1; - - mlir::Type inTy = op.getArg().getType(); - if(auto t = inTy.dyn_cast()) - numRows = t.getNumRows(); - else if(auto t = inTy.dyn_cast()) - numRows = t.getNumRows(); - - if(numRows != -1) { - rewriter.replaceOpWithNewOp( - op, rewriter.getIndexType(), rewriter.getIndexAttr(numRows) - ); - return mlir::success(); - } - return mlir::failure(); -} - -/** - * @brief Replaces NumColsOp by a constant, if the #cols of the input is known - * (e.g., due to shape inference). 
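 *
 * Aside (illustration, not part of the patch): e.g., if shape inference has
 * annotated X as a 3x5 matrix, nrow(X) and ncol(X) fold to the constants 3
 * and 5. The transposition-aware rewrite further up similarly turns
 * matMul(X, t(Y), false, false) into matMul(X, Y, false, true), fusing the
 * transposition into the kernel call instead of materializing t(Y).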
- */ -mlir::LogicalResult mlir::daphne::NumColsOp::canonicalize( - mlir::daphne::NumColsOp op, PatternRewriter &rewriter -) { - ssize_t numCols = -1; - - mlir::Type inTy = op.getArg().getType(); - if(auto t = inTy.dyn_cast()) - numCols = t.getNumCols(); - else if(auto t = inTy.dyn_cast()) - numCols = t.getNumCols(); - - if(numCols != -1) { - rewriter.replaceOpWithNewOp( - op, rewriter.getIndexType(), rewriter.getIndexAttr(numCols) - ); - return mlir::success(); - } - return mlir::failure(); -} - -/** - * @brief Replaces NumCellsOp by a constant, if the #rows and #cols of the - * input is known (e.g., due to shape inference). - */ -mlir::LogicalResult mlir::daphne::NumCellsOp::canonicalize( - mlir::daphne::NumCellsOp op, PatternRewriter &rewriter -) { - ssize_t numRows = -1; - ssize_t numCols = -1; - - mlir::Type inTy = op.getArg().getType(); - if(auto t = inTy.dyn_cast()) { - numRows = t.getNumRows(); - numCols = t.getNumCols(); - } - else if(auto t = inTy.dyn_cast()) { - numRows = t.getNumRows(); - numCols = t.getNumCols(); - } - - if(numRows != -1 && numCols != -1) { - rewriter.replaceOpWithNewOp( - op, rewriter.getIndexType(), rewriter.getIndexAttr(numRows * numCols) - ); - return mlir::success(); - } - return mlir::failure(); -} - -/** - * @brief Replaces SparsityOp by a constant, if the sparsity of the input is known - * (e.g., due to sparsity inference). - */ -mlir::LogicalResult mlir::daphne::SparsityOp::canonicalize( - mlir::daphne::SparsityOp op, PatternRewriter &rewriter -) { - double sparsity = -1.0; - - mlir::Type inTy = op.getArg().getType(); - if(auto t = inTy.dyn_cast()) - sparsity = t.getSparsity(); - - if(sparsity != -1) { - rewriter.replaceOpWithNewOp( - op, sparsity - ); - return mlir::success(); - } - return mlir::failure(); -} - -/** - * @brief Replaces a `DistributeOp` by a `DistributedReadOp`, if its input - * value (a) is defined by a `ReadOp`, and (b) is not used elsewhere. - * @param context - */ -struct SimplifyDistributeRead : public mlir::OpRewritePattern { - SimplifyDistributeRead(mlir::MLIRContext *context) - : OpRewritePattern(context, 1) { - // - } - - mlir::LogicalResult - matchAndRewrite( - mlir::daphne::DistributeOp op, mlir::PatternRewriter &rewriter - ) const override { - mlir::daphne::ReadOp readOp = op.getMat().getDefiningOp(); - if(!readOp || !readOp.getOperation()->hasOneUse()) - return mlir::failure(); - rewriter.replaceOp( - op, {rewriter.create( - readOp.getLoc(), op.getType(), readOp.getFileName() - )} - ); - // TODO Instead of erasing the ReadOp here, the compiler should - // generally remove unused SSA values. Then, we might even drop the - // hasOneUse requirement above. - rewriter.eraseOp(readOp); - return mlir::success(); - } -}; - -/** - * @brief Replaces (1) `a + b` by `a concat b`, if `a` or `b` is a string, - * and (2) `a + X` by `X + a` (`a` scalar, `X` matrix/frame). - * - * (1) is important, since we use the `+`-operator for both addition and - * string concatenation in DaphneDSL, while the types of the operands might be - * known only after type inference. - * - * (2) is important, since our kernels for elementwise binary operations only support - * scalars as the right-hand-side operand so far (see #203). 
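 *
 * Examples (illustration, not part of the patch):
 *   (1) "ab" + 1 -> concat("ab", cast-to-string(1)), once types are known;
 *   (2) 5 + X    -> X + 5, for a scalar 5 and a matrix X.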
- * - * @param op - * @param rewriter - * @return - */ -mlir::LogicalResult mlir::daphne::EwAddOp::canonicalize( - mlir::daphne::EwAddOp op, PatternRewriter &rewriter -) { - mlir::Value lhs = op.getLhs(); - mlir::Value rhs = op.getRhs(); - - const bool lhsIsStr = llvm::isa(lhs.getType()); - const bool rhsIsStr = llvm::isa(rhs.getType()); - if(lhsIsStr || rhsIsStr) { - mlir::Type strTy = mlir::daphne::StringType::get(rewriter.getContext()); - if(!lhsIsStr) - lhs = rewriter.create(op.getLoc(), strTy, lhs); - if(!rhsIsStr) - rhs = rewriter.create(op.getLoc(), strTy, rhs); - rewriter.replaceOpWithNewOp(op, strTy, lhs, rhs); - return mlir::success(); - } - else { - const bool lhsIsSca = !llvm::isa(lhs.getType()); - const bool rhsIsSca = !llvm::isa(rhs.getType()); - if(lhsIsSca && !rhsIsSca) { - rewriter.replaceOpWithNewOp(op, op.getResult().getType(), rhs, lhs); - return mlir::success(); - } - return mlir::failure(); - } -} - -/** - * @brief Replaces `a - X` by `(X * -1) + a` (`a` scalar, `X` matrix/frame). - * - * This is important, since our kernels for elementwise binary operations only support - * scalars as the right-hand-side operand so far (see #203). - * - * As a downside, an additional operation and intermediate result is introduced. - * - * @param op - * @param rewriter - * @return - */ -mlir::LogicalResult mlir::daphne::EwSubOp::canonicalize( - mlir::daphne::EwSubOp op, PatternRewriter &rewriter -) { - mlir::Value lhs = op.getLhs(); - mlir::Value rhs = op.getRhs(); - const bool lhsIsSca = !llvm::isa(lhs.getType()); - const bool rhsIsSca = !llvm::isa(rhs.getType()); - if(lhsIsSca && !rhsIsSca) { - rewriter.replaceOpWithNewOp( - op, - op.getResult().getType(), - rewriter.create( - op->getLoc(), - mlir::daphne::UnknownType::get(op->getContext()), // to be inferred - rhs, - rewriter.create(op->getLoc(), int64_t(-1)) - ), - lhs - ); - return mlir::success(); - } - return mlir::failure(); -} - -/** - * @brief Replaces `a * X` by `X * a` (`a` scalar, `X` matrix/frame). - * - * This is important, since our kernels for elementwise binary operations only support - * scalars as the right-hand-side operand so far (see #203). - * - * @param op - * @param rewriter - * @return - */ -mlir::LogicalResult mlir::daphne::EwMulOp::canonicalize( - mlir::daphne::EwMulOp op, PatternRewriter &rewriter -) { - mlir::Value lhs = op.getLhs(); - mlir::Value rhs = op.getRhs(); - const bool lhsIsSca = !llvm::isa(lhs.getType()); - const bool rhsIsSca = !llvm::isa(rhs.getType()); - if(lhsIsSca && !rhsIsSca) { - rewriter.replaceOpWithNewOp(op, op.getResult().getType(), rhs, lhs); - return mlir::success(); - } - return mlir::failure(); -} - -/** - * @brief Replaces `a / X` by `(X ^ -1) * a` (`a` scalar, `X` matrix/frame), - * if `X` has a floating-point value type. - * - * This is important, since our kernels for elementwise binary operations only support - * scalars as the right-hand-side operand so far (see #203). - * - * As a downside, an additional operation and intermediate result is introduced. 
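 *
 * Example (illustration, not part of the patch): for a scalar a = 2.0 and a
 * floating-point matrix X, 2.0 / X becomes (X ^ -1) * 2.0, trading one extra
 * elementwise power for the scalar-rhs multiplication kernel.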
- * - * @param op - * @param rewriter - * @return - */ -mlir::LogicalResult mlir::daphne::EwDivOp::canonicalize( - mlir::daphne::EwDivOp op, PatternRewriter &rewriter -) { - mlir::Value lhs = op.getLhs(); - mlir::Value rhs = op.getRhs(); - const bool lhsIsSca = !llvm::isa(lhs.getType()); - const bool rhsIsSca = !llvm::isa(rhs.getType()); - const bool rhsIsFP = llvm::isa(CompilerUtils::getValueType(rhs.getType())); - if(lhsIsSca && !rhsIsSca && rhsIsFP) { - rewriter.replaceOpWithNewOp( - op, - op.getResult().getType(), - rewriter.create( - op->getLoc(), - mlir::daphne::UnknownType::get(op->getContext()), // to be inferred - rhs, - rewriter.create(op->getLoc(), double(-1)) - ), - lhs - ); - return mlir::success(); - } - return mlir::failure(); -} - -void mlir::daphne::DistributeOp::getCanonicalizationPatterns( - RewritePatternSet &results, MLIRContext *context -) { - results.add(context); -} - -mlir::LogicalResult mlir::daphne::CondOp::canonicalize(mlir::daphne::CondOp op, - mlir::PatternRewriter &rewriter) -{ - mlir::Value cond = op.getCond(); - if(llvm::isa(cond.getType())) - // If the condition is not a scalar, we cannot rewrite the operation here. - return mlir::failure(); - else { - // If the condition is a scalar, we rewrite the operation to an if-then-else construct - // using the SCF dialect. - // TODO Check if it is really a scalar. - - mlir::Location loc = op.getLoc(); - - // Ensure that the condition is a boolean. - if(!cond.getType().isSignlessInteger(1)) - cond = rewriter.create(loc, rewriter.getI1Type(), cond); - - mlir::Block thenBlock; - mlir::Block elseBlock; - mlir::Value thenVal = op.getThenVal(); - mlir::Value elseVal = op.getElseVal(); - - // Get rid of frame column labels, since they interfere with the type comparison (see #485). - if(auto thenFrmTy = thenVal.getType().dyn_cast()) - if(thenFrmTy.getLabels() != nullptr) - thenVal = rewriter.create(loc, thenFrmTy.withLabels(nullptr), thenVal); - if(auto elseFrmTy = elseVal.getType().dyn_cast()) - if(elseFrmTy.getLabels() != nullptr) - elseVal = rewriter.create(loc, elseFrmTy.withLabels(nullptr), elseVal); - - // Check if the types of the then-value and the else-value are the same. - if(thenVal.getType() != elseVal.getType()) { - if(llvm::isa(thenVal.getType()) || llvm::isa(elseVal.getType())) - // If one of them is unknown, we abort the rewrite (but this is not an error). - // The type may become known later, this rewrite will be triggered again. - return mlir::failure(); - else - // If both types are known, but different, this is an error. - // TODO We could try to cast the types. - throw ErrorHandler::compilerError( - op, "CanonicalizerPass (mlir::daphne::CondOp)", - "the then/else-values of CondOp must have the same value " - "type"); - } - - { - // Save the insertion point (automatically restored at the end of the block). - PatternRewriter::InsertionGuard insertGuard(rewriter); - - // TODO The current implementation only makes sure that the correct value is - // returned, but the operations calculating the then/else-values are still - // outside the if-then-else and will always both be executed (unless, e.g., - // the entire branching can be elimitated). This could be good (e.g., if - // the then/else-values have common subexpressions with other code) or bad - // (e.g., if they are expensive to compute). See #486. - - // Create yield-operations in both branches. 
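// Aside (illustration, not part of the patch): the net effect of this
// rewrite in pseudo-IR:
//     %r = daphne.cond %c, %thenVal, %elseVal
// becomes
//     %r = scf.if %c { scf.yield %thenVal } else { scf.yield %elseVal }
// with an additional CastOp on %c first if it is not already an i1.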
- rewriter.setInsertionPointToEnd(&thenBlock); - rewriter.create(loc, thenVal); - rewriter.setInsertionPointToEnd(&elseBlock); - rewriter.create(loc, elseVal); - } - - // Helper functions to move the operations in the two blocks created above - // into the actual branches of the if-operation. - auto insertThenBlockDo = [&](mlir::OpBuilder & nested, mlir::Location loc) { - nested.getBlock()->getOperations().splice(nested.getBlock()->end(), thenBlock.getOperations()); - }; - auto insertElseBlockDo = [&](mlir::OpBuilder & nested, mlir::Location loc) { - nested.getBlock()->getOperations().splice(nested.getBlock()->end(), elseBlock.getOperations()); - }; - - // Replace the daphne::CondOp by an scf::IfOp. - rewriter.replaceOpWithNewOp( - op, cond, insertThenBlockDo, insertElseBlockDo - ); - - return mlir::success(); - } -} - -mlir::LogicalResult mlir::daphne::ConvertDenseMatrixToMemRef::canonicalize( - mlir::daphne::ConvertDenseMatrixToMemRef op, - mlir::PatternRewriter &rewriter) { - // removes unnecessary conversions of MemRef -> DM -> MemRef - mlir::Operation *dmNode = op->getOperand(0).getDefiningOp(); - - if (!llvm::isa(dmNode)) - return failure(); - - mlir::Operation *originalMemRefOp = - dmNode->getPrevNode()->getOperand(0).getDefiningOp(); - op.replaceAllUsesWith(originalMemRefOp); - - rewriter.eraseOp(op); - if (dmNode->getUsers().empty()) rewriter.eraseOp(dmNode); - - return mlir::success(); -} - -mlir::LogicalResult mlir::daphne::ConvertMemRefToDenseMatrix::canonicalize( - mlir::daphne::ConvertMemRefToDenseMatrix op, - mlir::PatternRewriter &rewriter) { - mlir::Operation *extractPtr = op->getPrevNode(); - auto srcMemRef = extractPtr->getOperand(0).getDefiningOp(); - extractPtr->moveAfter(srcMemRef); - op->moveAfter(extractPtr); - - return mlir::success(); -} - -mlir::LogicalResult mlir::daphne::RenameOp::canonicalize( - mlir::daphne::RenameOp op, - mlir::PatternRewriter &rewriter -) { - // Replace the RenameOp by its argument, since we only need - // this operation during DaphneDSL parsing. - rewriter.replaceOp(op, op.getArg()); - return mlir::success(); -} - - -/** - * @brief Replaces `--a` by `a` (`a` scalar). 
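 *
 * Example (illustration, not part of the patch): for a scalar a, -(-a)
 * rewrites directly to a; the operand of the inner EwMinusOp is reused and
 * both negations disappear.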
- * - * @param op - * @param rewriter - * @return - */ -mlir::LogicalResult mlir::daphne::EwMinusOp::canonicalize( - mlir::daphne::EwMinusOp op, PatternRewriter &rewriter -) { - if (auto innerOp = op.getOperand().getDefiningOp()) { - rewriter.replaceOp(op, innerOp.getOperand()); - return mlir::success(); - } - return mlir::failure(); -} \ No newline at end of file diff --git a/src/ir/daphneir/DaphneDistributableOpInterface.cpp b/src/ir/daphneir/DaphneDistributableOpInterface.cpp index d12eb6e43..0575b33f7 100644 --- a/src/ir/daphneir/DaphneDistributableOpInterface.cpp +++ b/src/ir/daphneir/DaphneDistributableOpInterface.cpp @@ -17,12 +17,11 @@ #include #include +#include #include #include -#include -namespace mlir::daphne -{ +namespace mlir::daphne { #include } @@ -42,14 +41,12 @@ Type getWrappedType(Value v) { return wrappedType.dyn_cast().withSameElementTypeAndRepr(); } -template +template std::vector createEquivalentDistributedDAG_EwBinaryOp(EwBinaryOp *op, mlir::OpBuilder &builder, - mlir::ValueRange distributedInputs) -{ + mlir::ValueRange distributedInputs) { auto loc = op->getLoc(); - auto compute = builder.create(loc, - ArrayRef{daphne::HandleType::get(op->getContext(), op->getType())}, - distributedInputs); + auto compute = builder.create( + loc, ArrayRef{daphne::HandleType::get(op->getContext(), op->getType())}, distributedInputs); auto &block = compute.getBody().emplaceBlock(); auto argLhs = block.addArgument(getWrappedType(distributedInputs[0]), builder.getUnknownLoc()); auto argRhs = block.addArgument(getWrappedType(distributedInputs[1]), builder.getUnknownLoc()); @@ -68,50 +65,46 @@ std::vector createEquivalentDistributedDAG_EwBinaryOp(EwBinaryOp *o return ret; } -template -std::vector getOperandDistrPrimitives_EwBinaryOp(EwBinaryOp *op) { +template std::vector getOperandDistrPrimitives_EwBinaryOp(EwBinaryOp *op) { Type tL0 = op->getLhs().getType(); - auto tL = tL0.dyn_cast(); + auto tL = tL0.dyn_cast(); Type tR0 = op->getRhs().getType(); - auto tR = tR0.dyn_cast(); + auto tR = tR0.dyn_cast(); const ssize_t nrL = tL.getNumRows(); const ssize_t ncL = tL.getNumCols(); const ssize_t nrR = tR.getNumRows(); const ssize_t ncR = tR.getNumCols(); if (nrL == -1 || nrR == -1 || ncL == -1 || ncR == -1) - throw ErrorHandler::compilerError( - op->getLoc(), "DistributableOpInterface", - "unknown shapes of left and/or right operand to elementwise " - "binary operation are not supported while deciding " - "distribute/broadcast"); - - if(nrL == nrR && ncL == ncR) // matrix-matrix - return {false, false}; // distribute both inputs - else if(nrR == 1 && ncL == ncR) // matrix-row - return {false, true}; // distribute lhs, broadcast rhs - else if(nrL == nrR && ncR == 1) // matrix-col - return {false, true}; // distribute lhs, broadcast rhs + throw ErrorHandler::compilerError(op->getLoc(), "DistributableOpInterface", + "unknown shapes of left and/or right operand to elementwise " + "binary operation are not supported while deciding " + "distribute/broadcast"); + + if (nrL == nrR && ncL == ncR) // matrix-matrix + return {false, false}; // distribute both inputs + else if (nrR == 1 && ncL == ncR) // matrix-row + return {false, true}; // distribute lhs, broadcast rhs + else if (nrL == nrR && ncR == 1) // matrix-col + return {false, true}; // distribute lhs, broadcast rhs else - throw ErrorHandler::compilerError( - op->getLoc(), "DistributableOpInterface", - "mismatching shapes of left and right operand to elementwise " - "binary operation while deciding distribute/broadcast"); + throw 
ErrorHandler::compilerError(op->getLoc(), "DistributableOpInterface", + "mismatching shapes of left and right operand to elementwise " + "binary operation while deciding distribute/broadcast"); } // **************************************************************************** // DistributableOpInterface implementations // **************************************************************************** -#define IMPL_EWBINARYOP(OP) \ - std::vector mlir::daphne::OP::createEquivalentDistributedDAG(mlir::OpBuilder &builder, \ - mlir::ValueRange distributedInputs) \ - { \ - return createEquivalentDistributedDAG_EwBinaryOp(this, builder, distributedInputs); \ - } \ - \ - std::vector mlir::daphne::OP::getOperandDistrPrimitives() { \ - return getOperandDistrPrimitives_EwBinaryOp(this); \ +#define IMPL_EWBINARYOP(OP) \ + std::vector mlir::daphne::OP::createEquivalentDistributedDAG(mlir::OpBuilder &builder, \ + mlir::ValueRange distributedInputs) { \ + return createEquivalentDistributedDAG_EwBinaryOp(this, builder, distributedInputs); \ + } \ + \ + std::vector mlir::daphne::OP::getOperandDistrPrimitives() { \ + return getOperandDistrPrimitives_EwBinaryOp(this); \ } // TODO We should use traits (like for shape inference) so that we don't need @@ -149,13 +142,11 @@ IMPL_EWBINARYOP(EwLeOp) IMPL_EWBINARYOP(EwGtOp) IMPL_EWBINARYOP(EwGeOp) -std::vector daphne::RowAggMaxOp::createEquivalentDistributedDAG( - OpBuilder &builder, ValueRange distributedInputs -) { +std::vector daphne::RowAggMaxOp::createEquivalentDistributedDAG(OpBuilder &builder, + ValueRange distributedInputs) { auto loc = getLoc(); - auto compute = builder.create(loc, - ArrayRef{daphne::HandleType::get(getContext(), getType())}, - distributedInputs); + auto compute = builder.create( + loc, ArrayRef{daphne::HandleType::get(getContext(), getType())}, distributedInputs); auto &block = compute.getBody().emplaceBlock(); auto arg = block.addArgument(getWrappedType(distributedInputs[0]), builder.getUnknownLoc()); @@ -172,6 +163,4 @@ std::vector daphne::RowAggMaxOp::createEquivalentDistributedDAG( return ret; } -std::vector daphne::RowAggMaxOp::getOperandDistrPrimitives() { - return {false}; -} +std::vector daphne::RowAggMaxOp::getOperandDistrPrimitives() { return {false}; } diff --git a/src/ir/daphneir/DaphneDistributableOpInterface.h b/src/ir/daphneir/DaphneDistributableOpInterface.h index 0bcde3064..7d3c8a024 100644 --- a/src/ir/daphneir/DaphneDistributableOpInterface.h +++ b/src/ir/daphneir/DaphneDistributableOpInterface.h @@ -17,8 +17,8 @@ #ifndef SRC_IR_DAPHNEIR_DAPHNEDISTRIBUTABLEOPINTERFACE_H #define SRC_IR_DAPHNEIR_DAPHNEDISTRIBUTABLEOPINTERFACE_H -#include "mlir/IR/OpDefinition.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/OpDefinition.h" namespace mlir { namespace daphne { diff --git a/src/ir/daphneir/DaphneInferFrameLabelsOpInterface.cpp b/src/ir/daphneir/DaphneInferFrameLabelsOpInterface.cpp index 0bfaa8c53..6746aebbe 100644 --- a/src/ir/daphneir/DaphneInferFrameLabelsOpInterface.cpp +++ b/src/ir/daphneir/DaphneInferFrameLabelsOpInterface.cpp @@ -15,15 +15,15 @@ */ #include -#include #include #include +#include #include +#include #include #include -#include namespace mlir::daphne { #include @@ -36,10 +36,9 @@ using namespace mlir; // **************************************************************************** // For families of operations. 
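// Aside (illustration, not part of the patch): the label-inference methods in
// this file all end in the same step -- build or copy a std::vector of column
// labels and attach it to the result's FrameType via withLabels(). A minimal
// self-contained sketch of the copying step (copyLabels is hypothetical):
#include <string>
#include <vector>
static std::vector<std::string> *copyLabels(const std::vector<std::string> *src) {
    if (!src)
        return nullptr; // unknown labels stay unknown
    return new std::vector<std::string>(*src); // ownership passes to the new frame type
}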
-template <class ExtractOrFilterRowOp> -void inferFrameLabels_ExtractOrFilterRowOp(ExtractOrFilterRowOp * op) { +template <class ExtractOrFilterRowOp> void inferFrameLabels_ExtractOrFilterRowOp(ExtractOrFilterRowOp *op) { Type t = op->getSource().getType(); - if(auto ft = t.dyn_cast<daphne::FrameType>()) { + if (auto ft = t.dyn_cast<daphne::FrameType>()) { Value res = op->getResult(); res.setType(res.getType().dyn_cast<daphne::FrameType>().withLabels(ft.getLabels())); } @@ -53,7 +52,7 @@ void daphne::ReadOp::inferFrameLabels() { auto p = CompilerUtils::isConstant<std::string>(getFileName()); if (auto resType = getRes().getType().dyn_cast<daphne::FrameType>()) { if (p.first) { - std::vector<std::string> * labels; + std::vector<std::string> *labels; FileMetaData fmd = CompilerUtils::getFileMetaData(getFileName()); if (fmd.labels.empty()) { labels = nullptr; @@ -71,21 +70,19 @@ void daphne::ColBindOp::inferFrameLabels() { auto ftLhs = getLhs().getType().dyn_cast<daphne::FrameType>(); auto ftRhs = getRhs().getType().dyn_cast<daphne::FrameType>(); - if(!ftLhs || !ftRhs) - throw ErrorHandler::compilerError( - getLoc(), "daphne::ColBindOp::inferFrameLabels", - "currently ColBindOp can only infer its output labels if both " - "inputs are frames"); - if(!ftLhs.getLabels() || !ftRhs.getLabels()) - throw ErrorHandler::compilerError( - getLoc(), "daphne::ColBindOp::inferFrameLabels", - "currenly ColBindOp can only infer its output labels if the " - "labels of both input frames are known"); + if (!ftLhs || !ftRhs) + throw ErrorHandler::compilerError(getLoc(), "daphne::ColBindOp::inferFrameLabels", + "currently ColBindOp can only infer its output labels if both " + "inputs are frames"); + if (!ftLhs.getLabels() || !ftRhs.getLabels()) + throw ErrorHandler::compilerError(getLoc(), "daphne::ColBindOp::inferFrameLabels", + "currently ColBindOp can only infer its output labels if the " + "labels of both input frames are known"); auto labelsRes = new std::vector<std::string>(); - for(auto l : *(ftLhs.getLabels())) + for (auto l : *(ftLhs.getLabels())) labelsRes->push_back(l); - for(auto l : *(ftRhs.getLabels())) + for (auto l : *(ftRhs.getLabels())) labelsRes->push_back(l); Value res = getResult(); @@ -94,7 +91,7 @@ void daphne::CreateFrameOp::inferFrameLabels() { auto resLabels = new std::vector<std::string>(); - for(Value label : getLabels()) + for (Value label : getLabels()) resLabels->push_back(CompilerUtils::constantOrThrow<std::string>(label)); Value res = getResult(); res.setType(res.getType().dyn_cast<daphne::FrameType>().withLabels(resLabels)); @@ -103,8 +100,8 @@ void daphne::ExtractColOp::inferFrameLabels() { auto ft = getSource().getType().dyn_cast<daphne::FrameType>(); auto st = getSelectedCols().getType().dyn_cast<daphne::StringType>(); - - if(ft && st) { + + if (ft && st) { std::string label = CompilerUtils::constantOrThrow<std::string>(getSelectedCols()); std::string delimiter = "."; const std::string frameName = label.substr(0, label.find(delimiter)); @@ -116,7 +113,7 @@ std::string labelFrameName = label.substr(0, label.find(delimiter)); if (labelFrameName.compare(frameName) == 0) { resultLabels->push_back(label); - } + } } Value res = getResult(); res.setType(res.getType().dyn_cast<daphne::FrameType>().withLabels(resultLabels)); @@ -126,23 +123,18 @@ Value res = getResult(); res.setType(res.getType().dyn_cast<daphne::FrameType>().withLabels(resLabels)); } - } - } -void daphne::ExtractRowOp::inferFrameLabels() { - inferFrameLabels_ExtractOrFilterRowOp(this); -} +void daphne::ExtractRowOp::inferFrameLabels() { inferFrameLabels_ExtractOrFilterRowOp(this); } -void daphne::FilterRowOp::inferFrameLabels() { - inferFrameLabels_ExtractOrFilterRowOp(this); -} +void 
daphne::FilterRowOp::inferFrameLabels() { inferFrameLabels_ExtractOrFilterRowOp(this); } void daphne::GroupJoinOp::inferFrameLabels() { auto newLabels = new std::vector(); newLabels->push_back(CompilerUtils::constantOrThrow(getLhsOn())); - newLabels->push_back(std::string("SUM(") + CompilerUtils::constantOrThrow(getRhsAgg()) + std::string(")")); + newLabels->push_back(std::string("SUM(") + CompilerUtils::constantOrThrow(getRhsAgg()) + + std::string(")")); Value res = getResult(0); res.setType(res.getType().dyn_cast().withLabels(newLabels)); } @@ -158,14 +150,14 @@ void daphne::CartesianOp::inferFrameLabels() { auto newLabels = new std::vector(); auto ft1 = getLhs().getType().dyn_cast(); auto ft2 = getRhs().getType().dyn_cast(); - std::vector * labelsStr1 = ft1.getLabels(); - std::vector * labelsStr2 = ft2.getLabels(); + std::vector *labelsStr1 = ft1.getLabels(); + std::vector *labelsStr2 = ft2.getLabels(); - if(labelsStr1) - for(auto labelStr : *labelsStr1) + if (labelsStr1) + for (auto labelStr : *labelsStr1) newLabels->push_back(labelStr); - if(labelsStr2) - for(auto labelStr : *labelsStr2) + if (labelsStr2) + for (auto labelStr : *labelsStr2) newLabels->push_back(labelStr); getResult().setType(getRes().getType().dyn_cast().withLabels(newLabels)); @@ -173,7 +165,7 @@ void daphne::CartesianOp::inferFrameLabels() { void daphne::OrderOp::inferFrameLabels() { Type t = getArg().getType(); - if(auto ft = t.dyn_cast()) { + if (auto ft = t.dyn_cast()) { Value res = getResult(); res.setType(res.getType().dyn_cast().withLabels(ft.getLabels())); } @@ -183,32 +175,32 @@ void daphne::InnerJoinOp::inferFrameLabels() { auto newLabels = new std::vector(); auto ft1 = getLhs().getType().dyn_cast(); auto ft2 = getRhs().getType().dyn_cast(); - std::vector * labelsStr1 = ft1.getLabels(); - std::vector * labelsStr2 = ft2.getLabels(); + std::vector *labelsStr1 = ft1.getLabels(); + std::vector *labelsStr2 = ft2.getLabels(); - if(labelsStr1) - for(auto labelStr : *labelsStr1) + if (labelsStr1) + for (auto labelStr : *labelsStr1) newLabels->push_back(labelStr); - if(labelsStr2) - for(auto labelStr : *labelsStr2) + if (labelsStr2) + for (auto labelStr : *labelsStr2) newLabels->push_back(labelStr); getResult().setType(getRes().getType().dyn_cast().withLabels(newLabels)); } void daphne::ThetaJoinOp::inferFrameLabels() { - std::vector * newLabels = nullptr; - + std::vector *newLabels = nullptr; + auto ft1 = getLhs().getType().dyn_cast(); auto ft2 = getRhs().getType().dyn_cast(); - std::vector * labelsStr1 = ft1.getLabels(); - std::vector * labelsStr2 = ft2.getLabels(); + std::vector *labelsStr1 = ft1.getLabels(); + std::vector *labelsStr2 = ft2.getLabels(); - if(labelsStr1 && labelsStr2) { + if (labelsStr1 && labelsStr2) { newLabels = new std::vector(); - for(auto labelStr : *labelsStr1) + for (auto labelStr : *labelsStr1) newLabels->push_back(labelStr); - for(auto labelStr : *labelsStr2) + for (auto labelStr : *labelsStr2) newLabels->push_back(labelStr); } @@ -220,18 +212,18 @@ void daphne::GroupOp::inferFrameLabels() { std::vector aggColLabels; std::vector aggFuncNames; - for(Value t: getKeyCol()){ //Adopting keyCol Labels + for (Value t : getKeyCol()) { // Adopting keyCol Labels std::string keyLabel = CompilerUtils::constantOrThrow(t); std::string delimiter = "."; const std::string frameName = keyLabel.substr(0, keyLabel.find(delimiter)); const std::string colLabel = keyLabel.substr(keyLabel.find(delimiter) + delimiter.length(), keyLabel.length()); - - if(keyLabel == "*") { + + if (keyLabel == "*") { 
daphne::FrameType arg = getFrame().getType().dyn_cast(); for (std::string frameLabel : *arg.getLabels()) { newLabels->push_back(frameLabel); } - } else if(colLabel.compare("*") == 0) { + } else if (colLabel.compare("*") == 0) { daphne::FrameType arg = getFrame().getType().dyn_cast(); std::vector labels = *arg.getLabels(); for (std::string label : labels) { @@ -245,14 +237,14 @@ void daphne::GroupOp::inferFrameLabels() { } } - for(Value t: getAggCol()){ + for (Value t : getAggCol()) { aggColLabels.push_back(CompilerUtils::constantOrThrow(t)); } - for(Attribute t: getAggFuncs()){ + for (Attribute t : getAggFuncs()) { GroupEnum aggFuncValue = t.dyn_cast().getValue(); aggFuncNames.push_back(stringifyGroupEnum(aggFuncValue).str()); } - for(size_t i = 0; i < aggFuncNames.size() && i < aggColLabels.size(); i++){ + for (size_t i = 0; i < aggFuncNames.size() && i < aggColLabels.size(); i++) { newLabels->push_back(aggFuncNames.at(i) + "(" + aggColLabels.at(i) + ")"); } @@ -261,11 +253,10 @@ void daphne::GroupOp::inferFrameLabels() { void daphne::SetColLabelsOp::inferFrameLabels() { auto newLabels = new std::vector(); - for(Value label : getLabels()) { + for (Value label : getLabels()) { try { newLabels->push_back(CompilerUtils::constantOrThrow(label)); - } - catch(std::runtime_error&) { + } catch (std::runtime_error &) { // TODO This could be improved by supporting knowledge on only some // of the labels. // If we do not know the values of all label operands at @@ -281,9 +272,9 @@ void daphne::SetColLabelsPrefixOp::inferFrameLabels() { auto newLabels = new std::vector(); std::string prefixStr = CompilerUtils::constantOrThrow(getPrefix()); auto ft = getArg().getType().dyn_cast(); - std::vector * labelsStr = ft.getLabels(); - if(labelsStr) - for(auto labelStr : *labelsStr) + std::vector *labelsStr = ft.getLabels(); + if (labelsStr) + for (auto labelStr : *labelsStr) newLabels->push_back(LabelUtils::setPrefix(prefixStr, labelStr)); else { delete newLabels; diff --git a/src/ir/daphneir/DaphneInferShapeOpInterface.cpp b/src/ir/daphneir/DaphneInferShapeOpInterface.cpp index 91e05c648..d0c82c93e 100644 --- a/src/ir/daphneir/DaphneInferShapeOpInterface.cpp +++ b/src/ir/daphneir/DaphneInferShapeOpInterface.cpp @@ -15,15 +15,15 @@ */ #include -#include #include #include +#include #include -#include #include #include +#include namespace mlir::daphne { #include @@ -38,37 +38,37 @@ using namespace mlir::OpTrait; std::pair getShape(Value v) { Type t = v.getType(); - if(auto mt = t.dyn_cast()) + if (auto mt = t.dyn_cast()) return std::make_pair(mt.getNumRows(), mt.getNumCols()); - if(auto ft = t.dyn_cast()) + if (auto ft = t.dyn_cast()) return std::make_pair(ft.getNumRows(), ft.getNumCols()); // TODO Maybe check if it is really a scalar type. else // scalar return std::make_pair(1, 1); } -ssize_t inferNumRowsFromArgs(Operation* op, ValueRange vs) { +ssize_t inferNumRowsFromArgs(Operation *op, ValueRange vs) { // If the #rows of all arguments is known and matches, then this is the // inferred #rows. If the known #rows of any two arguments mismatch, an // exception is thrown. Otherwise, if the #rows of any argument is unknown, // the inferred #rows is unknown. 
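// Aside (illustration, not part of the patch): expected results of the
// reduction below over the per-argument #rows, with -1 encoding "unknown":
//     {4, 4, 4}  -> 4
//     {4, -1, 4} -> -1 (some argument unknown)
//     {4, 5, 4}  -> compiler error (two known, mismatching values)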
     ssize_t numRows = getShape(vs[0]).first;
     bool someUnknown = false;
-    if(numRows == -1)
+    if (numRows == -1)
         someUnknown = true;
-    for(size_t i = 1; i < vs.size(); i++) {
+    for (size_t i = 1; i < vs.size(); i++) {
         const ssize_t nextNumRows = getShape(vs[i]).first;
-        if(nextNumRows == -1)
+        if (nextNumRows == -1)
             someUnknown = true;
-        else if(numRows == -1)
+        else if (numRows == -1)
             numRows = nextNumRows;
-        else if(nextNumRows != numRows)
+        else if (nextNumRows != numRows)
             throw ErrorHandler::compilerError(op->getLoc(), "InferShapeOpInterface",
-                "shape inference: inferNumRowsFromArgs() requires that "
-                "arguments have the same number of rows, but there is "
-                "one with " + std::to_string(numRows) + " and one with " +
-                std::to_string(nextNumRows) + " rows"
-            );
+                                              "shape inference: inferNumRowsFromArgs() requires that "
+                                              "arguments have the same number of rows, but there is "
+                                              "one with " +
+                                                  std::to_string(numRows) + " and one with " +
+                                                  std::to_string(nextNumRows) + " rows");
     }
     return someUnknown ? -1 : numRows;
 }
@@ -80,30 +80,30 @@ ssize_t inferNumColsFromArgs(Operation *op, ValueRange vs) {
     // If the #cols of all arguments is known and matches, then this is the
     // inferred #cols. If the known #cols of any two arguments mismatch, an
     // exception is thrown. Otherwise, if the #cols of any argument is unknown,
     // the inferred #cols is unknown.
     ssize_t numCols = getShape(vs[0]).second;
     bool someUnknown = false;
-    if(numCols == -1)
+    if (numCols == -1)
         someUnknown = true;
-    for(size_t i = 1; i < vs.size(); i++) {
+    for (size_t i = 1; i < vs.size(); i++) {
         const ssize_t nextNumCols = getShape(vs[i]).second;
-        if(nextNumCols == -1)
+        if (nextNumCols == -1)
             someUnknown = true;
-        else if(numCols == -1)
+        else if (numCols == -1)
             numCols = nextNumCols;
-        else if(nextNumCols != numCols)
+        else if (nextNumCols != numCols)
             throw ErrorHandler::compilerError(op->getLoc(), "InferShapeOpInterface",
-                "shape inference: inferNumColsFromArgs() requires that "
-                "arguments have the same number of columns, but there is "
-                "one with " + std::to_string(numCols) + " and one with " +
-                std::to_string(nextNumCols) + " columns"
-            );
+                                              "shape inference: inferNumColsFromArgs() requires that "
+                                              "arguments have the same number of columns, but there is "
+                                              "one with " +
+                                                  std::to_string(numCols) + " and one with " +
+                                                  std::to_string(nextNumCols) + " columns");
     }
     return someUnknown ? -1 : numCols;
 }
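MatMulOp's rule above: the result takes the (possibly transposed) row count of the left operand and the (possibly transposed) column count of the right operand, and a dimension stays -1 (unknown) when the corresponding transpose flag is not a compile-time constant. A hypothetical standalone version of the constant-flag case:

    #include <sys/types.h> // ssize_t (POSIX)
    #include <utility>

    // Shape of lhs (m x k, or k x m if transa) times rhs (k x n, or n x k if transb).
    std::pair<ssize_t, ssize_t> matMulShape(std::pair<ssize_t, ssize_t> lhs, std::pair<ssize_t, ssize_t> rhs,
                                            bool transa, bool transb) {
        return {transa ? lhs.second : lhs.first, transb ? rhs.first : rhs.second};
    }

E.g., a (2 x 3) times (3 x 4) product gives matMulShape({2, 3}, {3, 4}, false, false) == {2, 4}, and transposing a (3 x 2) left operand yields the same result: matMulShape({3, 2}, {3, 4}, true, false) == {2, 4}.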
 ssize_t inferNumRowsFromSumOfArgs(ValueRange vs) {
     ssize_t sumNumRows = 0;
-    for(Value v : vs) {
+    for (Value v : vs) {
         const ssize_t numRows = getShape(v).first;
-        if(numRows == -1)
+        if (numRows == -1)
             return -1;
         sumNumRows += numRows;
     }
@@ -112,9 +112,9 @@ ssize_t inferNumColsFromSumOfArgs(ValueRange vs) {
     ssize_t sumNumCols = 0;
-    for(Value v : vs) {
+    for (Value v : vs) {
         const ssize_t numCols = getShape(v).second;
-        if(numCols == -1)
+        if (numCols == -1)
             return -1;
         sumNumCols += numCols;
     }
@@ -133,43 +133,38 @@ ssize_t daphne::CartesianOp::inferNumRows() {

 ssize_t daphne::SeqOp::inferNumRows() {
     Type fromTy = getFrom().getType();
-    if(fromTy.isF64()) {
+    if (fromTy.isF64()) {
         try {
             double vFrom = CompilerUtils::constantOrThrow<double>(getFrom());
             double vTo = CompilerUtils::constantOrThrow<double>(getTo());
             double vInc = CompilerUtils::constantOrThrow<double>(getInc());
             return floor(vTo / vInc - vFrom / vInc) + 1;
-        }
-        catch(const std::runtime_error & e) {
+        } catch (const std::runtime_error &e) {
             return -1;
         }
     }
-    if(fromTy.isF32()) {
+    if (fromTy.isF32()) {
         try {
             float vFrom = CompilerUtils::constantOrThrow<float>(getFrom());
             float vTo = CompilerUtils::constantOrThrow<float>(getTo());
             float vInc = CompilerUtils::constantOrThrow<float>(getInc());
             return floor(vTo / vInc - vFrom / vInc) + 1;
-        }
-        catch(const std::runtime_error & e) {
+        } catch (const std::runtime_error &e) {
             return -1;
         }
-    }
-    else if(fromTy.isSignedInteger(64)) {
+    } else if (fromTy.isSignedInteger(64)) {
         try {
             int64_t vFrom = CompilerUtils::constantOrThrow<int64_t>(getFrom());
             int64_t vTo = CompilerUtils::constantOrThrow<int64_t>(getTo());
             int64_t vInc = CompilerUtils::constantOrThrow<int64_t>(getInc());
             return abs(vTo - vFrom) / abs(vInc) + 1;
-        }
-        catch(const std::runtime_error & e) {
+        } catch (const std::runtime_error &e) {
             return -1;
         }
     }
-    throw ErrorHandler::compilerError(
-        getLoc(), "InferShapeOpInterface (daphne::SeqOp::inferNumRows)",
-        "at the moment, shape inference for SeqOp supports only F64 and "
-        "SI64 value types");
+    throw ErrorHandler::compilerError(getLoc(), "InferShapeOpInterface (daphne::SeqOp::inferNumRows)",
+                                      "at the moment, shape inference for SeqOp supports only F64 and "
+                                      "SI64 value types");
 }

 std::vector<std::pair<ssize_t, ssize_t>> daphne::CreateFrameOp::inferShape() {
@@ -188,18 +183,18 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::GroupOp::inferShape() {
     std::vector<std::string> newLabels;
-    for(Value t: getKeyCol()){ //Adopting keyCol Labels
+    for (Value t : getKeyCol()) { // Adopting keyCol Labels
         std::string keyLabel = CompilerUtils::constantOrThrow<std::string>(t);
         std::string delimiter = ".";
         const std::string frameName = keyLabel.substr(0, keyLabel.find(delimiter));
         const std::string colLabel = keyLabel.substr(keyLabel.find(delimiter) + delimiter.length(), keyLabel.length());
-
-        if(keyLabel == "*") {
+
+        if (keyLabel == "*") {
             daphne::FrameType arg = getFrame().getType().dyn_cast<daphne::FrameType>();
             for (std::string frameLabel : *arg.getLabels()) {
                 newLabels.push_back(frameLabel);
             }
-        } else if(colLabel.compare("*") == 0) {
+        } else if (colLabel.compare("*") == 0) {
             daphne::FrameType arg = getFrame().getType().dyn_cast<daphne::FrameType>();
             std::vector<std::string> labels = *arg.getLabels();
             for (std::string label : labels) {
@@ -212,7 +207,7 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::GroupOp::inferShape() {
             newLabels.push_back(keyLabel);
         }
     }
-
+
     const size_t numCols = newLabels.size() + getAggCol().size();
     return {{numRows, numCols}};
 }
@@ -223,12 +218,12 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::MatMulOp::inferShape() {
     ssize_t numRows = -1;
     std::pair<bool, bool> pr = CompilerUtils::isConstant<bool>(getTransa());
-    if(pr.first)
+    if (pr.first)
         numRows = pr.second ? shapeLhs.second : shapeLhs.first;
-
+
     ssize_t numCols = -1;
     std::pair<bool, bool> pc = CompilerUtils::isConstant<bool>(getTransb());
-    if(pc.first)
+    if (pc.first)
         numCols = pc.second ? shapeRhs.first : shapeRhs.second;

     return {{numRows, numCols}};
 }
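Conv2DForwardOp relies on the standard convolution output-size formula, Hout = floor((Hin + 2 * padh - Hf) / strideh) + 1 (and analogously for the width), with the result flattened to one row per input and F * Hout * Wout columns. A small worked sketch (hypothetical helper, not part of this patch):

    #include <cstddef>

    // Output extent of one spatial dimension; integer division floors here.
    size_t convOutDim(size_t in, size_t filter, size_t pad, size_t stride) {
        return (in + 2 * pad - filter) / stride + 1;
    }

For a 32 x 32 input with 3 x 3 filters, padding 1, and stride 1, convOutDim(32, 3, 1, 1) == 32, so F == 16 filters yield 16 * 32 * 32 == 16384 columns per row.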
@@ -249,20 +244,19 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::OrderOp::inferShape() {
     size_t numCols = -1;
     Type t = getArg().getType();
-    if(auto mt = t.dyn_cast<daphne::MatrixType>()){
+    if (auto mt = t.dyn_cast<daphne::MatrixType>()) {
         numRows = mt.getNumRows();
         numCols = mt.getNumCols();
     }
-    if(auto ft = t.dyn_cast<daphne::FrameType>()){
+    if (auto ft = t.dyn_cast<daphne::FrameType>()) {
         numRows = ft.getNumRows();
         numCols = ft.getNumCols();
     }
     std::pair<bool, bool> p = CompilerUtils::isConstant<bool>(getReturnIdxs());
-    if(p.first) {
-        if(p.second)
+    if (p.first) {
+        if (p.second)
             numCols = 1;
-    }
-    else
+    } else
         numCols = -1;

     return {{numRows, numCols}};
@@ -270,23 +264,22 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::CondOp::inferShape() {
     Type condTy = getCond().getType();
-    if(llvm::isa<daphne::UnknownType>(condTy))
+    if (llvm::isa<daphne::UnknownType>(condTy))
         // Actually, this should not happen, because if the type of the
         // condition is unknown, the type of the result should be unknown
         // too per type inference, such that shape inference should not
         // even get called. Nevertheless, returning unknown will probably
         // not hurt in case anyone ever calls this from somewhere else.
         return {{-1, -1}};
-    if(auto condMatTy = condTy.dyn_cast<daphne::MatrixType>())
+    if (auto condMatTy = condTy.dyn_cast<daphne::MatrixType>())
         return {{condMatTy.getNumRows(), condMatTy.getNumCols()}};
-    else if(auto condFrmTy = condTy.dyn_cast<daphne::FrameType>())
-        throw ErrorHandler::compilerError(
-            getLoc(), "InferShapeOpInterface (daphne::CondOp::inferShape)",
-            "CondOp does not support frames for the condition yet");
+    else if (auto condFrmTy = condTy.dyn_cast<daphne::FrameType>())
+        throw ErrorHandler::compilerError(getLoc(), "InferShapeOpInterface (daphne::CondOp::inferShape)",
+                                          "CondOp does not support frames for the condition yet");
     else { // cond is a scalar
         // TODO check if it is really a scalar
         Type thenTy = getThenVal().getType();
         Type elseTy = getElseVal().getType();
-
+
         ssize_t thenNumRows = -1;
         ssize_t thenNumCols = -1;
         ssize_t elseNumRows = -1;
@@ -295,28 +288,23 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::CondOp::inferShape() {
         auto thenFrmTy = thenTy.dyn_cast<daphne::FrameType>();
         auto elseMatTy = elseTy.dyn_cast<daphne::MatrixType>();
         auto elseFrmTy = elseTy.dyn_cast<daphne::FrameType>();
-        if(thenMatTy) {
+        if (thenMatTy) {
             thenNumRows = thenMatTy.getNumRows();
             thenNumCols = thenMatTy.getNumCols();
-        }
-        else if(thenFrmTy) {
+        } else if (thenFrmTy) {
             thenNumRows = thenFrmTy.getNumRows();
             thenNumCols = thenFrmTy.getNumCols();
         }
-        if(elseMatTy) {
+        if (elseMatTy) {
             elseNumRows = elseMatTy.getNumRows();
             elseNumCols = elseMatTy.getNumCols();
-        }
-        else if(elseFrmTy) {
+        } else if (elseFrmTy) {
             elseNumRows = elseFrmTy.getNumRows();
             elseNumCols = elseFrmTy.getNumCols();
         }
-        if((thenMatTy || thenFrmTy) && (elseMatTy || elseFrmTy))
-            return {{
-                (thenNumRows == elseNumRows) ? thenNumRows : -1,
-                (thenNumCols == elseNumCols) ? thenNumCols : -1
-            }};
+        if ((thenMatTy || thenFrmTy) && (elseMatTy || elseFrmTy))
+            return {{(thenNumRows == elseNumRows) ? thenNumRows : -1, (thenNumCols == elseNumCols) ? thenNumCols : -1}};
         else
             // Then-value or else-value is a scalar.
             return {{-1, -1}};
@@ -342,7 +330,8 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::Conv2DForwardOp::inferShape() {
     ssize_t numRows = shapeX.first;
     ssize_t numCols = F == -1 ? -1 : F * Hout * Wout;
-    // op output is [mat, scalar, scalar] for the convolved data and its dimensions
+    // op output is [mat, scalar, scalar] for the convolved data and its
+    // dimensions
     return {{numRows, numCols}, std::make_pair(1, 1), std::make_pair(1, 1)};
 }
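AvgPoolForwardOp and MaxPoolForwardOp above compute their spatial output extents with the same formula as Conv2DForwardOp, only with the pooling window in place of the filter. Reusing the convOutDim sketch from above: a 32 x 32 input pooled with a 2 x 2 window, no padding, and stride 2 gives convOutDim(32, 2, 0, 2) == 16 per dimension, i.e. C * 16 * 16 output columns.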
@@ -362,7 +351,8 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::AvgPoolForwardOp::inferShape()
     size_t Wout = std::floor((Win + 2 * padw - Wf) / stridew + 1);
     auto numCols = C * Hout * Wout;
-    // op output is [mat, scalar, scalar] for the convolved data and its dimensions
+    // op output is [mat, scalar, scalar] for the convolved data and its
+    // dimensions
     return {{numRows, numCols}, std::make_pair(1, 1), std::make_pair(1, 1)};
 }

@@ -382,7 +372,8 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::MaxPoolForwardOp::inferShape()
     size_t Wout = std::floor((Win + 2 * padw - Wf) / stridew + 1);
     auto numCols = C * Hout * Wout;
-    // op output is [mat, scalar, scalar] for the convolved data and its dimensions
+    // op output is [mat, scalar, scalar] for the convolved data and its
+    // dimensions
     return {{numRows, numCols}, std::make_pair(1, 1), std::make_pair(1, 1)};
 }
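The bounds checks above enforce half-open slice semantics [lowerIncl, upperExcl): the lower bound must lie in [0, numRows), the upper bound in [0, numRows], and lowerIncl must not exceed upperExcl, with the result having upperExcl - lowerIncl rows. A condensed sketch of the same validity predicate (hypothetical, not part of this patch):

    #include <sys/types.h> // ssize_t (POSIX)

    // True iff [lo, up) is a valid row range over n rows, per the checks above.
    bool isValidSliceRange(ssize_t lo, ssize_t up, ssize_t n) {
        return lo >= 0 && lo < n && up >= 0 && up <= n && lo <= up;
    }

E.g., isValidSliceRange(2, 5, 10) holds and the slice has 5 - 2 == 3 rows, while isValidSliceRange(5, 2, 10) fails because lowerIncl exceeds upperExcl.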
@@ -394,14 +385,13 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::CTableOp::inferShape() {
     // the lhs and rhs input matrices) and the lhs/rhs input matrices
     // are compile-time constants, then we could determine the number
     // of rows/columns here.
-    return {{
-        CompilerUtils::constantOrDefault<ssize_t>(getResNumRows(), -1),
-        CompilerUtils::constantOrDefault<ssize_t>(getResNumCols(), -1)
-    }};
+    return {{CompilerUtils::constantOrDefault<ssize_t>(getResNumRows(), -1),
+             CompilerUtils::constantOrDefault<ssize_t>(getResNumCols(), -1)}};
 }

 std::vector<std::pair<ssize_t, ssize_t>> daphne::MatrixConstantOp::inferShape() {
-    const Structure* mat = reinterpret_cast<const Structure*>(CompilerUtils::constantOrThrow<size_t>(getMatrixAddr()));
+    const Structure *mat =
+        reinterpret_cast<const Structure *>(CompilerUtils::constantOrThrow<size_t>(getMatrixAddr()));
     return {{mat->getNumRows(), mat->getNumCols()}};
 }

@@ -409,57 +399,48 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::SliceRowOp::inferShape() {
     Type srcTy = getSource().getType();
     ssize_t srcNumRows;
     ssize_t srcNumCols;
-    if(llvm::isa<daphne::UnknownType>(srcTy)) {
+    if (llvm::isa<daphne::UnknownType>(srcTy)) {
         srcNumRows = -1;
         srcNumCols = -1;
-    }
-    else if(auto srcMatTy = srcTy.dyn_cast<daphne::MatrixType>()) {
+    } else if (auto srcMatTy = srcTy.dyn_cast<daphne::MatrixType>()) {
         srcNumRows = srcMatTy.getNumRows();
         srcNumCols = srcMatTy.getNumCols();
-    }
-    else if(auto srcFrmTy = srcTy.dyn_cast<daphne::FrameType>()) {
+    } else if (auto srcFrmTy = srcTy.dyn_cast<daphne::FrameType>()) {
         srcNumRows = srcFrmTy.getNumRows();
         srcNumCols = srcFrmTy.getNumCols();
-    }
-    else
+    } else
         // If this is the case, shape inference shouldn't have been called.
-        throw ErrorHandler::compilerError(
-            getLoc(), "InferShapeOpInterface (daphne::SliceRowOp::inferShape)",
-            "SliceRowOp shape inference does only support unknown, matrix, and "
-            "frame inputs");
+        throw ErrorHandler::compilerError(getLoc(), "InferShapeOpInterface (daphne::SliceRowOp::inferShape)",
+                                          "SliceRowOp shape inference does only support unknown, matrix, and "
+                                          "frame inputs");

     auto loIn = CompilerUtils::isConstant<ssize_t>(getLowerIncl());
     auto upEx = CompilerUtils::isConstant<ssize_t>(getUpperExcl());

     ssize_t resNumRows = -1;
-    if(srcNumRows != -1 && loIn.first && upEx.first) {
+    if (srcNumRows != -1 && loIn.first && upEx.first) {
         ssize_t loInPos = loIn.second;
         ssize_t upExPos = upEx.second;
-        if(loInPos < 0 || loInPos >= srcNumRows)
-            throw ErrorHandler::compilerError(
-                getLoc(),
-                "InferShapeOpInterface (daphne::SliceRowOp::inferShape)",
-                "SliceRowOp shape inference: lowerIncl must be in [0, numRows), "
-                "but is " +
-                    std::to_string(loInPos) + " with " +
-                    std::to_string(srcNumRows) + " rows");
-        if(upExPos < 0 || upExPos > srcNumRows)
-            throw ErrorHandler::compilerError(
-                getLoc(),
-                "InferShapeOpInterface (daphne::SliceRowOp::inferShape)",
-                "SliceRowOp shape inference: upperExcl must be in [0, numRows], "
-                "but is " + std::to_string(upExPos) +
-                " with " + std::to_string(srcNumRows) + " rows"
-            );
-        if(loInPos > upExPos)
-            throw ErrorHandler::compilerError(
-                getLoc(),
-                "InferShapeOpInterface (daphne::SliceRowOp::inferShape)",
-                "SliceRowOp shape inference: lowerIncl must not be greater "
-                "than upperExcl"
-                " (found " +
-                    std::to_string(loInPos) + " and " +
-                    std::to_string(upExPos) + ")");
+        if (loInPos < 0 || loInPos >= srcNumRows)
+            throw ErrorHandler::compilerError(getLoc(), "InferShapeOpInterface (daphne::SliceRowOp::inferShape)",
+                                              "SliceRowOp shape inference: lowerIncl must be in [0, "
+                                              "numRows), "
+                                              "but is " +
+                                                  std::to_string(loInPos) + " with " + std::to_string(srcNumRows) +
+                                                  " rows");
+        if (upExPos < 0 || upExPos > srcNumRows)
+            throw ErrorHandler::compilerError(getLoc(), "InferShapeOpInterface (daphne::SliceRowOp::inferShape)",
+                                              "SliceRowOp shape inference: upperExcl must be in [0, "
+                                              "numRows], "
+                                              "but is " +
+                                                  std::to_string(upExPos) + " with " + std::to_string(srcNumRows) +
+                                                  " rows");
+        if (loInPos > upExPos)
+            throw ErrorHandler::compilerError(getLoc(), "InferShapeOpInterface (daphne::SliceRowOp::inferShape)",
+                                              "SliceRowOp shape inference: lowerIncl must not be greater "
+                                              "than upperExcl"
+                                              " (found " +
+                                                  std::to_string(loInPos) + " and " + std::to_string(upExPos) + ")");
         resNumRows = upExPos - loInPos;
     }
@@ -470,52 +451,45 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::SliceColOp::inferShape() {
     Type srcTy = getSource().getType();
     ssize_t srcNumRows;
     ssize_t srcNumCols;
-    if(auto srcMatTy = srcTy.dyn_cast<daphne::MatrixType>()) {
+    if (auto srcMatTy = srcTy.dyn_cast<daphne::MatrixType>()) {
         srcNumRows = srcMatTy.getNumRows();
         srcNumCols = srcMatTy.getNumCols();
-    }
-    else if(auto srcFrmTy = srcTy.dyn_cast<daphne::FrameType>()) {
+    } else if (auto srcFrmTy = srcTy.dyn_cast<daphne::FrameType>()) {
         srcNumRows = srcFrmTy.getNumRows();
         srcNumCols = srcFrmTy.getNumCols();
-    }
-    else
+    } else
         // If this is the case, shape inference shouldn't have been called.
-        throw ErrorHandler::compilerError(
-            getLoc(), "InferShapeOpInterface (daphne::SliceColOp::inferShape)",
-            "SliceColOp shape inference does only support matrix and frame "
-            "inputs");
+        throw ErrorHandler::compilerError(getLoc(), "InferShapeOpInterface (daphne::SliceColOp::inferShape)",
+                                          "SliceColOp shape inference does only support matrix and frame "
+                                          "inputs");

     auto loIn = CompilerUtils::isConstant<ssize_t>(getLowerIncl());
     auto upEx = CompilerUtils::isConstant<ssize_t>(getUpperExcl());

     ssize_t resNumCols = -1;
-    if(srcNumCols != -1 && loIn.first && upEx.first) {
+    if (srcNumCols != -1 && loIn.first && upEx.first) {
         ssize_t loInPos = loIn.second;
         ssize_t upExPos = upEx.second;
-        if(loInPos < 0 || loInPos >= srcNumCols)
-            throw ErrorHandler::compilerError(
-                getLoc(),
-                "InferShapeOpInterface (daphne::SliceColOp::inferShape)",
-                "SliceColOp shape inference: lowerIncl must be in [0, "
-                "numCols), "
-                "but is " +
-                    std::to_string(loInPos) + " with " +
-                    std::to_string(srcNumCols) + " cols");
-        if(upExPos < 0 || upExPos > srcNumCols)
-            throw ErrorHandler::compilerError(
-                getLoc(),
-                "InferShapeOpInterface (daphne::SliceColOp::inferShape)",
-                "SliceColOp shape inference: upperExcl must be in [0, numCols], "
-                "but is " + std::to_string(upExPos) +
-                " with " + std::to_string(srcNumCols) + " cols"
-            );
-        if(loInPos > upExPos)
-            throw ErrorHandler::compilerError(
-                getLoc(),
-                "InferShapeOpInterface (daphne::SliceColOp::inferShape)",
-                "SliceColOp shape inference: lowerIncl must not be greater than upperExcl"
-                " (found " + std::to_string(loInPos) + " and " + std::to_string(upExPos) + ")"
-            );
+        if (loInPos < 0 || loInPos >= srcNumCols)
+            throw ErrorHandler::compilerError(getLoc(), "InferShapeOpInterface (daphne::SliceColOp::inferShape)",
+                                              "SliceColOp shape inference: lowerIncl must be in [0, "
+                                              "numCols), "
+                                              "but is " +
+                                                  std::to_string(loInPos) + " with " + std::to_string(srcNumCols) +
+                                                  " cols");
+        if (upExPos < 0 || upExPos > srcNumCols)
+            throw ErrorHandler::compilerError(getLoc(), "InferShapeOpInterface (daphne::SliceColOp::inferShape)",
+                                              "SliceColOp shape inference: upperExcl must be in [0, "
+                                              "numCols], "
+                                              "but is " +
+                                                  std::to_string(upExPos) + " with " + std::to_string(srcNumCols) +
+                                                  " cols");
+        if (loInPos > upExPos)
+            throw ErrorHandler::compilerError(getLoc(), "InferShapeOpInterface (daphne::SliceColOp::inferShape)",
+                                              "SliceColOp shape inference: lowerIncl must not be greater "
+                                              "than upperExcl"
+                                              " (found " +
+                                                  std::to_string(loInPos) + " and " + std::to_string(upExPos) + ")");
         resNumCols = upEx.second - loIn.second;
     }

@@ -526,8 +500,8 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::ExtractColOp::inferShape() {
     auto ft = getSource().getType().dyn_cast<daphne::FrameType>();
     auto srcNumRows = getShape(getOperand(0)).first;
     auto st = getSelectedCols().getType().dyn_cast<daphne::StringType>();
-
-    if(ft && st) {
+
+    if (ft && st) {
         std::string label = CompilerUtils::constantOrThrow<std::string>(getSelectedCols());
         std::string delimiter = ".";
         const std::string frameName = label.substr(0, label.find(delimiter));
@@ -545,7 +519,7 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::ExtractColOp::inferShape() {
         }
     }
     // Default case except when the selectedCols ends in a wildcard
-    return{{srcNumRows, getShape(getOperand(1)).second}};
+    return {{srcNumRows, getShape(getOperand(1)).second}};
 }

 std::vector<std::pair<ssize_t, ssize_t>> daphne::EigenOp::inferShape() {
@@ -562,25 +536,22 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::RecodeOp::inferShape() {
     ssize_t resNumRows;
     ssize_t resNumCols;
-    if(auto argMatTy = llvm::dyn_cast<daphne::MatrixType>(argTy)) {
+    if (auto argMatTy = llvm::dyn_cast<daphne::MatrixType>(argTy)) {
         resNumRows = argMatTy.getNumRows();
         resNumCols = argMatTy.getNumCols();
-    }
-    else if(auto argFrmTy = llvm::dyn_cast<daphne::FrameType>(argTy)) {
+    } else if (auto argFrmTy = llvm::dyn_cast<daphne::FrameType>(argTy)) {
        resNumRows = argFrmTy.getNumRows();
        resNumCols = argFrmTy.getNumCols();
-    }
-    else if(llvm::isa<daphne::UnknownType>(argTy)) {
+    } else if (llvm::isa<daphne::UnknownType>(argTy)) {
        resNumRows = -1;
        resNumCols = -1;
-    }
-    else
-        throw ErrorHandler::compilerError(
-            getLoc(), "InferShapeOpInterface (daphne::RecodeOp::inferShape)",
-            "the argument to recode has an invalid type");
+    } else
+        throw ErrorHandler::compilerError(getLoc(), "InferShapeOpInterface (daphne::RecodeOp::inferShape)",
+                                          "the argument to recode has an invalid type");

-    // TODO We could infer (or estimate) the number of rows of the dictionary result
-    // if we knew the number of distinct values in the argument (or could estimate it).
+    // TODO We could infer (or estimate) the number of rows of the dictionary
+    // result if we knew the number of distinct values in the argument (or could
+    // estimate it).
     const ssize_t dictNumRows = -1;
     const ssize_t dictNumCols = 1;
@@ -595,69 +566,60 @@ std::vector<std::pair<ssize_t, ssize_t>> daphne::RecodeOp::inferShape() {
  * @brief Utility for trying a parametric trait for all values of the parameter
  * from 0 to some upper bound.
  */
-template<size_t u, template<size_t> class tryParametricTrait>
-struct tryParamTraitUntil {
-    static void apply(ssize_t& numRows, ssize_t& numCols, Operation * op) {
+template <size_t u, template <size_t> class tryParametricTrait> struct tryParamTraitUntil {
+    static void apply(ssize_t &numRows, ssize_t &numCols, Operation *op) {
         tryParametricTrait<u>::apply(numRows, numCols, op);
         tryParamTraitUntil<u - 1, tryParametricTrait>::apply(numRows, numCols, op);
     }
 };
-template<template<size_t> class tryParametricTrait>
-struct tryParamTraitUntil<0, tryParametricTrait> {
-    static void apply(ssize_t& numRows, ssize_t& numCols, Operation * op) {
+template