From ac7b6cab97712a006390881a9ef0c8b0f6505943 Mon Sep 17 00:00:00 2001
From: Panagiotis Xanthopoulos <panos0511@gmail.com>
Date: Wed, 17 Apr 2024 18:05:13 +0300
Subject: [PATCH 1/2] Add COOMatrix implementation with corresponding tests,
 Extend kernels to support COOMatrix and corresponding tests

Co-authored-by: fwtostatho <fotinist01@gmail.com>
Co-authored-by: koutom111 <despinatmk@gmail.com>
---
 src/runtime/local/datagen/GenGivenVals.h      |  44 ++
 .../local/datastructures/CMakeLists.txt       |   1 +
 .../local/datastructures/COOMatrix.cpp        |  37 ++
 src/runtime/local/datastructures/COOMatrix.h  | 489 ++++++++++++++++++
 src/runtime/local/kernels/AggAll.h            |  77 +++
 src/runtime/local/kernels/AggCol.h            | 103 ++++
 src/runtime/local/kernels/AggRow.h            |  77 +++
 src/runtime/local/kernels/CheckEq.h           |  49 ++
 src/runtime/local/kernels/CheckEqApprox.h     |  52 ++
 src/runtime/local/kernels/EwBinaryMat.h       | 220 ++++++++
 src/runtime/local/kernels/EwUnaryMat.h        |  42 ++
 src/runtime/local/kernels/RandMatrix.h        | 101 ++++
 src/runtime/local/kernels/Transpose.h         |  47 ++
 test/CMakeLists.txt                           |   1 +
 .../local/datastructures/COOMatrixTest.cpp    | 149 ++++++
 test/runtime/local/kernels/AggAllTest.cpp     |   3 +-
 test/runtime/local/kernels/AggColTest.cpp     |   3 +-
 test/runtime/local/kernels/AggRowTest.cpp     |   3 +-
 .../local/kernels/CheckEqApproxTest.cpp       |   3 +-
 test/runtime/local/kernels/CheckEqTest.cpp    |   7 +-
 .../runtime/local/kernels/EwBinaryMatTest.cpp |   9 +-
 test/runtime/local/kernels/EwUnaryMatTest.cpp |  43 +-
 test/runtime/local/kernels/RandMatrixTest.cpp |   3 +-
 test/runtime/local/kernels/TransposeTest.cpp  |   3 +-
 24 files changed, 1539 insertions(+), 27 deletions(-)
 create mode 100644 src/runtime/local/datastructures/COOMatrix.cpp
 create mode 100644 src/runtime/local/datastructures/COOMatrix.h
 create mode 100644 test/runtime/local/datastructures/COOMatrixTest.cpp
diff --git a/src/runtime/local/datagen/GenGivenVals.h b/src/runtime/local/datagen/GenGivenVals.h
index dfcb6aaf9..739752ad8 100644
--- a/src/runtime/local/datagen/GenGivenVals.h
+++ b/src/runtime/local/datagen/GenGivenVals.h
@@ -20,6 +20,7 @@
 #include <runtime/local/datastructures/CSRMatrix.h>
 #include <runtime/local/datastructures/DataObjectFactory.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 
 #include <algorithm>
 #include <vector>
@@ -158,4 +159,47 @@ struct GenGivenVals<CSRMatrix<VT>> {
     }
 };
 
+// ----------------------------------------------------------------------------
+// COOMatrix
+// ----------------------------------------------------------------------------
+
+template<typename VT>
+struct GenGivenVals<COOMatrix<VT>> {
+    /**
+     * @brief Builds a `COOMatrix` from a row-major list of cell values.
+     * Only cells different from `VT(0)` are stored, in row-major order. The
+     * entry after the last stored cell is set to the end marker (value
+     * `VT(0)`, row and column index `size_t(-1)`).
+     *
+     * @param numRows Number of rows; must evenly divide `elements.size()`.
+     * @param elements All cells of the matrix in row-major order.
+     * @param minNumNonZeros Lower bound on the number of non-zeros to allocate.
+     * @return The newly created matrix.
+     */
+    static COOMatrix<VT> * generate(size_t numRows, const std::vector<VT> & elements, size_t minNumNonZeros = 0) {
+        const size_t numCells = elements.size();
+        assert((numCells % numRows == 0) && "number of given data elements must be divisible by given number of rows");
+        const size_t numCols = numCells / numRows;
+        const size_t numNonZeros = std::count_if(elements.begin(), elements.end(), [](VT v) { return v != VT(0); });
+        auto res = DataObjectFactory::create<COOMatrix<VT>>(numRows, numCols, std::max(numNonZeros, minNumNonZeros), false);
+        VT * const dstVals = res->getValues();
+        size_t * const dstCols = res->getColIdxs();
+        size_t * const dstRows = res->getRowIdxs();
+        size_t next = 0;
+        for(size_t cell = 0; cell < numCells; cell++) {
+            const VT v = elements[cell];
+            if(v == VT(0)) continue;
+            // Row-major layout: derive the coordinates from the flat cell index.
+            dstVals[next] = v;
+            dstCols[next] = cell % numCols;
+            dstRows[next] = cell / numCols;
+            next++;
+        }
+        // Terminate the three arrays with the end marker.
+        dstVals[next] = VT(0);
+        dstCols[next] = size_t(-1);
+        dstRows[next] = size_t(-1);
+        return res;
+    }
+};
+
 #endif //SRC_RUNTIME_LOCAL_DATAGEN_GENGIVENVALS_H
\ No newline at end of file
diff --git a/src/runtime/local/datastructures/CMakeLists.txt b/src/runtime/local/datastructures/CMakeLists.txt
index cf96e7f95..6ad8a8207 100644
--- a/src/runtime/local/datastructures/CMakeLists.txt
+++ b/src/runtime/local/datastructures/CMakeLists.txt
@@ -19,6 +19,7 @@ add_library(DataStructures
         DataPlacement.cpp
         DenseMatrix.cpp
         CSRMatrix.cpp
+        COOMatrix.cpp
         Frame.cpp
         IAllocationDescriptor.h
         MetaDataObject.h
diff --git a/src/runtime/local/datastructures/COOMatrix.cpp b/src/runtime/local/datastructures/COOMatrix.cpp
new file mode 100644
index 000000000..b74b90086
--- /dev/null
+++ b/src/runtime/local/datastructures/COOMatrix.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2021 The DAPHNE Consortium
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <runtime/local/io/DaphneSerializer.h>
+
+#include "COOMatrix.h"
+
+
+/* TODO Implement a COO serializer; until then, serialize() always throws std::runtime_error. */
+template<typename ValueType>
+size_t COOMatrix<ValueType>::serialize(std::vector<char> &buf) const {
+    throw std::runtime_error("COOMatrix does not support serialization yet");
+//    return DaphneSerializer<COOMatrix<ValueType>>::serialize(this, buf);
+}
+
+// Explicitly instantiate COOMatrix for these value types, so that the linker finds serialize() for each of them.
+template class COOMatrix<double>;
+template class COOMatrix<float>;
+template class COOMatrix<int>;
+template class COOMatrix<long>;
+template class COOMatrix<signed char>;
+template class COOMatrix<unsigned char>;
+template class COOMatrix<unsigned int>;
+template class COOMatrix<unsigned long>;
\ No newline at end of file
diff --git a/src/runtime/local/datastructures/COOMatrix.h b/src/runtime/local/datastructures/COOMatrix.h
new file mode 100644
index 000000000..79d1d9775
--- /dev/null
+++ b/src/runtime/local/datastructures/COOMatrix.h
@@ -0,0 +1,489 @@
+/*
+ * Copyright 2021 The DAPHNE Consortium
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <runtime/local/datastructures/DataObjectFactory.h>
+#include <runtime/local/datastructures/Matrix.h>
+#include <runtime/local/datastructures/ValueTypeUtils.h>
+
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <utility>
+#include <iomanip>
+
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+
+/**
+ * @brief A sparse matrix in COOrdinate (COO) format.
+ *
+ */
+template<typename ValueType>
+class COOMatrix : public Matrix<ValueType> {
+    // `using`, so that we do not need to prefix each occurrence of these
+    // fields from the super-classes.
+    using Matrix<ValueType>::numRows;
+    using Matrix<ValueType>::numCols;
+
+    /**
+     * @brief The maximum number of non-zero values this matrix was allocated
+     * to accommodate.
+     */
+    size_t maxNumNonZeros;
+
+    /**
+     * @brief We need these in order to accommodate row-based sub-matrix views.
+     */
+    size_t lowerRow;  // inclusive lower bound of the rows (as stored in rowIdxs) covered by this matrix
+    size_t upperRow;  // exclusive upper bound of the rows covered by this matrix
+
+    size_t appendHelp = 0;  // next write position used by prepareAppend()/append()/finishAppend()
+    bool view;  // true iff this object was created as a row-range view on another COOMatrix
+
+    std::shared_ptr<ValueType> values;  // stored values; the entry after the last one is the end marker ValueType(0)
+    std::shared_ptr<size_t> colIdxs;  // column index of each stored value; the end marker is size_t(-1)
+    std::shared_ptr<size_t> rowIdxs;  // row index of each stored value; the end marker is size_t(-1)
+
+    // Grant DataObjectFactory access to the private constructors and
+    // destructors.
+    template<class DataType, typename ... ArgTypes>
+    friend DataType *DataObjectFactory::create(ArgTypes ...);
+
+    template<class DataType>
+    friend void DataObjectFactory::destroy(const DataType *obj);
+
+    /**
+     * @brief Constructs an owning `COOMatrix` with room for `maxNumNonZeros`
+     * entries plus one end-marker entry in each of `values`, `colIdxs`, and
+     * `rowIdxs`; the matrix starts out empty.
+     *
+     * @param numRows The exact number of rows.
+     * @param numCols The exact number of columns.
+     * @param maxNumNonZeros The maximum number of non-zeros in the matrix.
+     * @param zero Whether to zero-fill the internal arrays (`true`) or not (`false`).
+     */
+    COOMatrix(size_t numRows, size_t numCols, size_t maxNumNonZeros, bool zero) :
+            Matrix<ValueType>(numRows, numCols),
+            maxNumNonZeros(maxNumNonZeros),
+            lowerRow(0),
+            upperRow(numRows),
+            view(false),
+            values(new ValueType[maxNumNonZeros + 1], std::default_delete<ValueType[]>()),
+            colIdxs(new size_t[maxNumNonZeros + 1], std::default_delete<size_t[]>()),
+            rowIdxs(new size_t[maxNumNonZeros + 1], std::default_delete<size_t[]>()) {
+        ValueType *const vals = values.get();
+        size_t *const rows = rowIdxs.get();
+        size_t *const cols = colIdxs.get();
+        if (zero) {
+            const size_t numSlots = maxNumNonZeros + 1;
+            memset(vals, 0, numSlots * sizeof(ValueType));
+            memset(cols, 0, numSlots * sizeof(size_t));
+            memset(rows, 0, numSlots * sizeof(size_t));
+        }
+        vals[0] = ValueType(0); rows[0] = size_t(-1); cols[0] = size_t(-1);  // empty matrix: end marker at position 0
+    }
+
+    /**
+     * @brief Constructs a view on a contiguous range of rows of another
+     * `COOMatrix`. The view shares the internal arrays of `src`; no data is
+     * copied.
+     *
+     * @param src The matrix to create the view on; must not be null.
+     * @param rowLowerIncl Inclusive lower bound for the range of rows to extract.
+     * @param rowUpperExcl Exclusive upper bound for the range of rows to extract.
+     */
+    COOMatrix(const COOMatrix<ValueType> *src, size_t rowLowerIncl, size_t rowUpperExcl) :
+            Matrix<ValueType>(rowUpperExcl - rowLowerIncl, src->numCols),
+            maxNumNonZeros(std::min(src->maxNumNonZeros, src->numCols * (rowUpperExcl - rowLowerIncl))),
+            lowerRow(rowLowerIncl),
+            upperRow(rowUpperExcl),
+            view(true),
+            values(src->values),
+            colIdxs(src->colIdxs),
+            rowIdxs(src->rowIdxs) {
+        assert(src && "src must not be null");
+        assert((rowLowerIncl < src->numRows) && "rowLowerIncl is out of bounds");
+        assert((rowUpperExcl <= src->numRows) && "rowUpperExcl is out of bounds");
+        assert((rowLowerIncl < rowUpperExcl) && "rowLowerIncl must be lower than rowUpperExcl");
+    }
+
+    virtual ~COOMatrix() {
+        // Nothing to do: the internal arrays are released by their shared_ptr members.
+    }
+
+    /**
+     * @brief Scans the row indexes, starting at position `start`, for the
+     * entries of row `rowIdx` (a row index as stored in `rowIdxs`, i.e.
+     * including any view offset).
+     *
+     * Entries of rows smaller than `rowIdx` are skipped; the scan stops at the
+     * first entry of a larger row or at the end marker.
+     *
+     * @return A pair of (position of the first entry of the row, number of
+     * entries counted for the row). If no entry was counted, the first element
+     * is the position at which the scan stopped.
+     */
+    [[nodiscard]] std::pair<size_t, size_t> rowRange(size_t rowIdx, size_t start) const {
+        const size_t *const rows = rowIdxs.get();
+        size_t first = 0, count = 0, i = start;
+        for (size_t row = rows[i]; row != size_t(-1); row = rows[++i]) {
+            if (row < rowIdx) continue;
+            if (count == 0) first = i;
+            if (row > rowIdx) return std::make_pair(first, count);
+            count++;
+        }
+        return std::make_pair(count == 0 ? i : first, count);
+    }
+
+    /**
+     * @brief Inserts a new entry at position `pos` of the internal arrays,
+     * shifting the entry currently at `pos` and all following entries
+     * (including the end marker) one position to the right.
+     *
+     * Zero values are never stored, so inserting `ValueType(0)` is a no-op; it
+     * is checked before the capacity assertion so that it cannot trip the
+     * assertion on a full matrix.
+     *
+     * @param pos Position at which the new entry shall be stored.
+     * @param rowIdx Row index of the new entry (as kept in `rowIdxs`).
+     * @param colIdx Column index of the new entry.
+     * @param value The value of the new entry.
+     */
+    void insert(size_t pos, size_t rowIdx, size_t colIdx, ValueType value) {
+        if (value == ValueType(0)) return;
+        assert((this->getNumNonZeros() < maxNumNonZeros) && "can't add any more nonzero values");
+
+        size_t end = pos;  // position of the end marker
+        while (rowIdxs.get()[end] != size_t(-1)) end++;
+        std::move_backward(values.get() + pos, values.get() + end + 1, values.get() + end + 2);
+        std::move_backward(rowIdxs.get() + pos, rowIdxs.get() + end + 1, rowIdxs.get() + end + 2);
+        std::move_backward(colIdxs.get() + pos, colIdxs.get() + end + 1, colIdxs.get() + end + 2);
+        rowIdxs.get()[pos] = rowIdx; values.get()[pos] = value; colIdxs.get()[pos] = colIdx;
+    }
+
+    /// Removes the entry at `idx` by moving every later entry, and finally the end marker, one position to the left.
+    void remove(size_t idx) {
+        ValueType *const vals = values.get();
+        size_t *const rows = rowIdxs.get();
+        size_t *const cols = colIdxs.get();
+        for (size_t i = idx; ; i++) {
+            rows[i] = rows[i + 1]; vals[i] = vals[i + 1]; cols[i] = cols[i + 1];
+            if (rows[i] == size_t(-1)) break;
+        }
+    }
+
+public:
+    [[nodiscard]] bool isView() const {
+        return view;  // true iff this matrix was created as a row-range view on another COOMatrix
+    }
+
+    [[nodiscard]] size_t getLowerRow() const {
+        return lowerRow;  // inclusive lower bound of the covered rows (0 unless this is a view)
+    }
+
+    [[nodiscard]] size_t getUpperRow() const {
+        return upperRow;  // exclusive upper bound of the covered rows
+    }
+
+    [[nodiscard]] size_t getMaxNumNonZeros() const {
+        return maxNumNonZeros;  // capacity in entries, not the number of entries currently stored
+    }
+
+    /**
+     * @brief Counts the stored entries whose row lies in
+     * [`lowerRow`, `upperRow`), by scanning the row indexes from the start.
+     * The scan ends at the end marker or at the first row >= `upperRow`.
+     */
+    [[nodiscard]] size_t getNumNonZeros() const {
+        const size_t *const rows = rowIdxs.get();
+        size_t cnt = 0;
+        for (size_t i = 0; rows[i] != size_t(-1); i++) {
+            if (rows[i] < lowerRow) continue;
+            if (rows[i] >= upperRow) break;
+            cnt++;
+        }
+        return cnt;
+    }
+
+    /// Number of entries that `rowRange()` counts for row `rowIdx` (relative to this matrix/view).
+    [[nodiscard]] size_t getNumNonZerosRow(size_t rowIdx) const {
+        assert((rowIdx < numRows) && "rowIdx is out of bounds");
+        const size_t absRow = rowIdx + lowerRow;  // row index as stored in rowIdxs
+        return rowRange(absRow, 0).second;
+    }
+
+    /**
+     * @brief Counts the stored entries in column `colIdx` whose row lies in
+     * [`lowerRow`, `upperRow`). All entries up to the end marker are scanned.
+     */
+    [[nodiscard]] size_t getNumNonZerosCol(size_t colIdx) const {
+        assert((colIdx < numCols) && "colIdx is out of bounds");
+        const size_t *const cols = colIdxs.get();
+        const size_t *const rows = rowIdxs.get();
+        size_t cnt = 0;
+        for (size_t i = 0; cols[i] != size_t(-1); i++)
+            cnt += (cols[i] == colIdx && rows[i] >= lowerRow && rows[i] < upperRow) ? 1 : 0;
+        return cnt;
+    }
+
+    /**
+     * @brief Returns a pointer into `values` at the position that `rowRange()`
+     * reports as the start of row `lowerRow`, i.e. of this matrix/view.
+     */
+    [[nodiscard]] ValueType *getValues() {
+        return values.get() + rowRange(lowerRow, 0).first;
+    }
+    [[nodiscard]] const ValueType *getValues() const {
+        return const_cast<COOMatrix<ValueType> *>(this)->getValues();
+    }
+
+    /**
+     * @brief Returns a pointer into `colIdxs` at the position that `rowRange()`
+     * reports as the start of row `lowerRow`, i.e. of this matrix/view.
+     */
+    [[nodiscard]] size_t *getColIdxs() {
+        return colIdxs.get() + rowRange(lowerRow, 0).first;
+    }
+    [[nodiscard]] const size_t *getColIdxs() const {
+        return const_cast<COOMatrix<ValueType> *>(this)->getColIdxs();
+    }
+
+    /**
+     * @brief Returns a pointer into `rowIdxs` at the position that `rowRange()`
+     * reports as the start of row `lowerRow`, i.e. of this matrix/view.
+     */
+    [[nodiscard]] size_t *getRowIdxs() {
+        return rowIdxs.get() + rowRange(lowerRow, 0).first;
+    }
+    [[nodiscard]] const size_t *getRowIdxs() const {
+        return const_cast<COOMatrix<ValueType> *>(this)->getRowIdxs();
+    }
+
+    /**
+     * @brief Returns a pointer into `values` at the position that `rowRange()`
+     * reports as the start of row `rowIdx` (relative to this matrix/view).
+     * `rowIdx == numRows` passes the assertion.
+     */
+    [[nodiscard]] ValueType *getValues(size_t rowIdx) {
+        assert((rowIdx <= numRows) && "rowIdx is out of bounds");
+        return values.get() + rowRange(rowIdx + lowerRow, 0).first;
+    }
+    [[nodiscard]] const ValueType *getValues(size_t rowIdx) const {
+        return const_cast<COOMatrix<ValueType> *>(this)->getValues(rowIdx);
+    }
+
+    /**
+     * @brief Returns a pointer into `colIdxs` at the position that `rowRange()`
+     * reports as the start of row `rowIdx` (relative to this matrix/view).
+     * `rowIdx == numRows` passes the assertion.
+     */
+    [[nodiscard]] size_t *getColIdxs(size_t rowIdx) {
+        assert((rowIdx <= numRows) && "rowIdx is out of bounds");
+        return colIdxs.get() + rowRange(rowIdx + lowerRow, 0).first;
+    }
+    [[nodiscard]] const size_t *getColIdxs(size_t rowIdx) const {
+        return const_cast<COOMatrix<ValueType> *>(this)->getColIdxs(rowIdx);
+    }
+
+    /**
+     * @brief Returns a pointer into `rowIdxs` at the position that `rowRange()`
+     * reports as the start of row `rowIdx` (relative to this matrix/view).
+     * `rowIdx == numRows` passes the assertion.
+     */
+    [[nodiscard]] size_t *getRowIdxs(size_t rowIdx) {
+        assert((rowIdx <= numRows) && "rowIdx is out of bounds");
+        return rowIdxs.get() + rowRange(rowIdx + lowerRow, 0).first;
+    }
+    [[nodiscard]] const size_t *getRowIdxs(size_t rowIdx) const {
+        return const_cast<COOMatrix<ValueType> *>(this)->getRowIdxs(rowIdx);
+    }
+
+    /**
+     * @brief Returns the value at (`rowIdx`, `colIdx`), or `ValueType(0)` if no
+     * entry is stored for that cell. `rowIdx` is relative to this matrix/view.
+     */
+    [[nodiscard]] ValueType get(size_t rowIdx, size_t colIdx) const override {
+        // Check against the view-relative bounds *before* applying the offset.
+        assert((rowIdx < numRows) && "rowIdx is out of bounds");
+        assert((colIdx < numCols) && "colIdx is out of bounds");
+        rowIdx += lowerRow;
+        // Scan up to the end marker: a view's maxNumNonZeros does not bound the
+        // positions of its entries in the shared arrays.
+        for (size_t i = 0; rowIdxs.get()[i] != size_t(-1); i++) {
+            const size_t row = rowIdxs.get()[i], col = colIdxs.get()[i];
+            if (row > rowIdx || (row == rowIdx && col > colIdx)) break;
+            if (row == rowIdx && col == colIdx) return values.get()[i];
+        }
+        return ValueType(0);
+    }
+
+    /**
+     * @brief Sets the cell (`rowIdx`, `colIdx`) to `value`; `rowIdx` is
+     * relative to this matrix/view.
+     *
+     * Entries are kept ordered by row, then column. Depending on whether an
+     * entry for the cell exists and whether `value` is zero, the entry is
+     * overwritten, removed, inserted, or nothing happens (zeros are not
+     * stored).
+     */
+    void set(size_t rowIdx, size_t colIdx, ValueType value) override {
+        // Check against the view-relative bounds *before* applying the offset.
+        assert((rowIdx < numRows) && "rowIdx is out of bounds");
+        assert((colIdx < numCols) && "colIdx is out of bounds");
+        rowIdx += lowerRow;
+        // Find the first position whose entry does not precede the cell. The
+        // scan runs to the end marker, since a view's maxNumNonZeros does not
+        // bound the positions of its entries in the shared arrays.
+        size_t pos = 0;
+        for (;; pos++) {
+            const size_t row = rowIdxs.get()[pos];
+            if (row == size_t(-1) || row > rowIdx) break;
+            if (row == rowIdx && colIdxs.get()[pos] >= colIdx) break;
+        }
+
+        const bool exists = rowIdxs.get()[pos] == rowIdx && colIdxs.get()[pos] == colIdx;
+        if (!exists) {
+            // Setting an absent cell to zero must not touch the arrays.
+            if (value != ValueType(0)) insert(pos, rowIdx, colIdx, value);
+        } else if (value == ValueType(0)) {
+            remove(pos);
+        } else {
+            values.get()[pos] = value;
+        }
+    }
+
+    void prepareAppend() override {
+        appendHelp = rowRange(lowerRow, 0).first;  // start appending at the position reported for row lowerRow
+    }
+
+    /**
+     * @brief Appends an entry at the current append position (zeros are skipped).
+     * `rowIdx` is view-relative; it is stored with the view offset applied.
+     */
+    void append(size_t rowIdx, size_t colIdx, ValueType value) override {
+        assert((rowIdx < numRows) && "rowIdx is out of bounds");
+        assert((colIdx < numCols) && "colIdx is out of bounds");
+        if (value == ValueType(0)) return;  // nothing is stored, so no capacity is needed
+        assert((appendHelp < maxNumNonZeros) && "can't add any more nonzero values");
+        rowIdxs.get()[appendHelp] = rowIdx + lowerRow; colIdxs.get()[appendHelp] = colIdx;
+        values.get()[appendHelp++] = value;
+    }
+
+    void finishAppend() override {
+        rowIdxs.get()[appendHelp] = size_t(-1);  // write the end marker at the current append position
+        values.get()[appendHelp] = ValueType(0);
+        colIdxs.get()[appendHelp] = size_t(-1);
+    }
+
+    /**
+     * @brief Pretty print of this matrix as a dense grid; cells without a
+     * stored entry are printed as zero.
+     *
+     * Every cell is padded to the width of the widest stored value of its own
+     * column. Only the entries and rows belonging to this matrix/view are
+     * considered, i.e. stored row indexes are compared with the view offset
+     * (`lowerRow`) applied.
+     *
+     * @param os The stream to print to.
+     */
+    void print(std::ostream &os) const override {
+        os << "COOMatrix(" << numRows << 'x' << numCols << ", "
+           << ValueTypeUtils::cppNameFor<ValueType> << ')' << std::endl << std::endl;
+
+        // Restrict everything to the entries of this matrix/view.
+        const size_t offset = rowRange(lowerRow, 0).first;
+        const size_t numNonZeros = this->getNumNonZeros();
+        const ValueType *const vals = values.get() + offset;
+        const size_t *const rows = rowIdxs.get() + offset;
+        const size_t *const cols = colIdxs.get() + offset;
+
+        // Width of each column: the longest textual form of its stored values.
+        std::vector<int> colWidths(numCols, 1);
+        for (size_t i = 0; i < numNonZeros; i++) {
+            std::ostringstream oss;
+            oss << vals[i];
+            colWidths[cols[i]] = std::max(static_cast<int>(oss.str().length()), colWidths[cols[i]]);
+        }
+
+        size_t i = 0;  // next stored entry to print
+        for (size_t currRow = 0; currRow < numRows; ++currRow) {
+            for (size_t currCol = 0; currCol < numCols; ++currCol) {
+                // Pad with the width of the column being printed. The column
+                // of the next stored entry must not be used for this, since
+                // it belongs to another cell and is size_t(-1) once the end
+                // marker has been reached.
+                os << std::setw(colWidths[currCol]);
+                if (i < numNonZeros && rows[i] == currRow + lowerRow && cols[i] == currCol)
+                    os << vals[i++] << " ";
+                else
+                    os << ValueType(0) << " ";
+            }
+            os << std::endl;
+        }
+    }
+
+    /**
+     * @brief Prints the internal arrays of this matrix, restricted to the
+     * entries that belong to this matrix/view.
+     *
+     * @param os The stream to print to.
+     */
+    void printRaw(std::ostream &os) const {
+        const size_t numNonZeros = this->getNumNonZeros();
+        os << "COOMatrix(" << numRows << 'x' << numCols << ", "
+           << ValueTypeUtils::cppNameFor<ValueType> << ')' << std::endl;
+        os << "maxNumNonZeros: \t" << maxNumNonZeros << std::endl;
+        os << "numNonZeros: \t" << numNonZeros << std::endl;
+        const size_t offset = rowRange(lowerRow, 0).first;
+        // Prints one labelled line with the first `numNonZeros` elements of
+        // `arr`, counted from `offset`.
+        const auto printArray = [&](const char *label, const auto *arr) {
+            os << label;
+            for (size_t i = 0; i < numNonZeros; i++) os << arr[offset + i] << ", ";
+            os << std::endl;
+        };
+        printArray("values: \t", values.get());
+        printArray("colIdxs: \t", colIdxs.get());
+        printArray("rowIdxs: \t", rowIdxs.get());
+    }
+
+    COOMatrix *sliceRow(size_t rl, size_t ru) const override {
+        return DataObjectFactory::create<COOMatrix>(this, rl, ru);  // new object built from (this, rl, ru), i.e. the arguments of the view constructor
+    }
+
+    COOMatrix *sliceCol(size_t cl, size_t cu) const override {
+        throw std::runtime_error("COOMatrix does not support column-based slicing yet");  // not implemented: always throws
+    }
+
+    COOMatrix *slice(size_t rl, size_t ru, size_t cl, size_t cu) const override {
+        throw std::runtime_error("COOMatrix does not support slicing yet");  // not implemented: always throws
+    }
+
+    [[nodiscard]] size_t bufferSize() {
+        return (maxNumNonZeros + 1) * (sizeof(ValueType) + sizeof(size_t) + sizeof(size_t));  // bytes for maxNumNonZeros + 1 entries in each of the three arrays
+    }
+
+    size_t serialize(std::vector<char> &buf) const override;
+};
+
+template<typename ValueType>
+std::ostream &operator<<(std::ostream &os, const COOMatrix<ValueType> &obj) {
+    obj.print(os);  // delegate to the matrix' pretty printer
+    return os;
+}
\ No newline at end of file
diff --git a/src/runtime/local/kernels/AggAll.h b/src/runtime/local/kernels/AggAll.h
index a163c4be9..568056162 100644
--- a/src/runtime/local/kernels/AggAll.h
+++ b/src/runtime/local/kernels/AggAll.h
@@ -19,6 +19,7 @@
 
 #include <runtime/local/context/DaphneContext.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/kernels/AggOpCode.h>
 #include <runtime/local/kernels/EwBinarySca.h>
@@ -189,4 +190,80 @@ struct AggAll<VTRes, CSRMatrix<VTArg>> {
     }
 };
 
+// ----------------------------------------------------------------------------
+// scalar <- COOMatrix
+// ----------------------------------------------------------------------------
+
+template<typename VTRes, typename VTArg>
+struct AggAll<VTRes, COOMatrix<VTArg>> {
+    /**
+     * @brief Folds the first `numNonZeros` elements of `values` with `func`.
+     *
+     * If the aggregation is not sparse-safe and there are fewer non-zeros than
+     * cells, a single zero is folded in as well to account for the cells not
+     * stored. If there are no non-zeros, `func(neutral, 0)` is returned.
+     */
+    static VTRes aggArray(const VTArg * values, size_t numNonZeros, size_t numCells, EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func, bool isSparseSafe, VTRes neutral, DCTX(ctx)) {
+        if(numNonZeros == 0)
+            return func(neutral, 0, ctx);
+
+        VTRes agg = static_cast<VTRes>(values[0]);
+        for(size_t i = 1; i < numNonZeros; i++)
+            agg = func(agg, static_cast<VTRes>(values[i]), ctx);
+        if(!isSparseSafe && numNonZeros < numCells)
+            agg = func(agg, 0, ctx);
+        return agg;
+    }
+
+    /**
+     * @brief Aggregates all cells of `arg` into a scalar according to `opCode`.
+     *
+     * The number of non-zeros and the start of the value array are determined
+     * once up front, since each of these accessors scans the matrix' row
+     * indexes; in particular, they are not re-evaluated per iteration of the
+     * loop computing the squared deviations.
+     *
+     * @return The aggregate: a pure binary reduction, MEAN, VAR, or STDDEV.
+     */
+    static VTRes apply(AggOpCode opCode, const COOMatrix<VTArg> * arg, DCTX(ctx)) {
+        const VTArg * valuesArg = arg->getValues(0);
+        const size_t numNonZeros = arg->getNumNonZeros();
+        const size_t numCells = arg->getNumRows() * arg->getNumCols();
+
+        if(AggOpCodeUtils::isPureBinaryReduction(opCode)) {
+            EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func = getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(opCode));
+            return aggArray(
+                    valuesArg,
+                    numNonZeros,
+                    numCells,
+                    func,
+                    AggOpCodeUtils::isSparseSafe(opCode),
+                    AggOpCodeUtils::template getNeutral<VTRes>(opCode),
+                    ctx
+            );
+        }
+
+        // The op-code is either MEAN or STDDEV or VAR; all of them need the mean.
+        EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func = getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(AggOpCode::SUM));
+        VTRes mean = aggArray(valuesArg, numNonZeros, numCells, func, true, VTRes(0), ctx);
+        mean = mean / numCells;
+        if (opCode == AggOpCode::MEAN)
+            return mean;
+
+        // Sum of squared deviations from the mean over the stored values ...
+        VTRes variance = 0;
+        for(size_t i = 0; i < numNonZeros; i++) {
+            const VTRes dev = static_cast<VTRes>(valuesArg[i]) - mean;
+            variance = variance + dev * dev;
+        }
+        // ... plus the contribution of every cell that is not stored (zero).
+        variance += (numCells - numNonZeros) * mean * mean;
+        variance /= numCells;
+
+        // VAR is the value before taking the square root; STDDEV takes it.
+        if (opCode == AggOpCode::VAR)
+            return variance;
+        return sqrt(variance);
+    }
+};
+
 #endif //SRC_RUNTIME_LOCAL_KERNELS_AGGALL_H
\ No newline at end of file
diff --git a/src/runtime/local/kernels/AggCol.h b/src/runtime/local/kernels/AggCol.h
index 6594abf0d..f2cf90b9c 100644
--- a/src/runtime/local/kernels/AggCol.h
+++ b/src/runtime/local/kernels/AggCol.h
@@ -19,6 +19,7 @@
 
 #include <runtime/local/context/DaphneContext.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DataObjectFactory.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/kernels/AggOpCode.h>
@@ -281,4 +282,106 @@ struct AggCol<DenseMatrix<VTRes>, CSRMatrix<VTArg>> {
     }
 };
 
+// ----------------------------------------------------------------------------
+// DenseMatrix <- COOMatrix
+// ----------------------------------------------------------------------------
+
+template<typename VTRes, typename VTArg>
+struct AggCol<DenseMatrix<VTRes>, COOMatrix<VTArg>> {
+    static void apply(AggOpCode opCode, DenseMatrix<VTRes> *& res, const COOMatrix<VTArg> * arg, DCTX(ctx)) {
+        const size_t numRows = arg->getNumRows();
+        const size_t numCols = arg->getNumCols();
+
+        if(res == nullptr)
+            res = DataObjectFactory::create<DenseMatrix<VTRes>>(1, numCols, true);
+
+        VTRes * valuesRes = res->getValues();
+
+        EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func;
+        if(AggOpCodeUtils::isPureBinaryReduction(opCode))
+            func = getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(opCode));
+        else
+            // TODO Setting the function pointer yields the correct result.
+            // However, since MEAN and STDDEV are not sparse-safe, the program
+            // does not take the same path for doing the summation, and is less
+            // efficient.
+            // For MEAN, STDDEV, and VAR, we need to sum.
+            func = getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(AggOpCode::SUM));
+
+        const VTArg * valuesArg = arg->getValues(0);
+        const size_t * colIdxsArg = arg->getColIdxs(0);
+
+        const size_t numNonZeros = arg->getNumNonZeros();
+
+        if(AggOpCodeUtils::isSparseSafe(opCode)) {
+            for(size_t i = 0; i < numNonZeros; i++) {  // fold every stored value into the result cell of its column
+                const size_t colIdx = colIdxsArg[i];
+                valuesRes[colIdx] = func(valuesRes[colIdx], static_cast<VTRes>(valuesArg[i]), ctx);
+            }
+        }
+        else {
+            size_t * hist = new size_t[numCols](); // initialized to zeros; counts the stored values per column
+
+            const size_t numNonZerosFirstRowArg = arg->getNumNonZerosRow(0);
+            for(size_t i = 0; i < numNonZerosFirstRowArg; i++) {  // values of the first row are assigned, not folded
+                size_t colIdx = colIdxsArg[i];
+                valuesRes[colIdx] = static_cast<VTRes>(valuesArg[i]);
+                hist[colIdx]++;
+            }
+
+            if(arg->getNumRows() > 1) {
+                for(size_t i = numNonZerosFirstRowArg; i < numNonZeros; i++) {  // fold in the stored values of the remaining rows
+                    const size_t colIdx = colIdxsArg[i];
+                    valuesRes[colIdx] = func(valuesRes[colIdx], static_cast<VTRes>(valuesArg[i]), ctx);
+                    hist[colIdx]++;
+                }
+                for(size_t c = 0; c < numCols; c++)
+                    if(hist[c] < numRows)  // fewer stored values than rows: fold one zero into this column
+                        valuesRes[c] = func(valuesRes[c], VTRes(0), ctx);
+            }
+
+            delete[] hist;
+        }
+
+        if(AggOpCodeUtils::isPureBinaryReduction(opCode))
+            return;
+
+        // The op-code is either MEAN or STDDEV or VAR.
+
+        for(size_t c = 0; c < numCols; c++)
+            valuesRes[c] /= arg->getNumRows();  // divide each column's aggregate by the number of rows
+
+        if(opCode == AggOpCode::MEAN)
+            return;
+
+        auto tmp = DataObjectFactory::create<DenseMatrix<VTRes>>(1, numCols, true);
+        VTRes * valuesT = tmp->getValues();  // accumulates the squared deviations per column
+
+        size_t * nnzCol = new size_t[numCols](); // initialized to zeros
+        for(size_t i = 0; i < numNonZeros; i++) {
+            const size_t colIdx = colIdxsArg[i];
+            VTRes val = static_cast<VTRes>(valuesArg[i]) - valuesRes[colIdx];
+            valuesT[colIdx] = valuesT[colIdx] + val * val;
+            nnzCol[colIdx]++;
+        }
+
+        for(size_t c = 0; c < numCols; c++) {
+            // Take all zeros in the column into account.
+            valuesT[c] += (valuesRes[c] * valuesRes[c]) * (numRows - nnzCol[c]);
+            // Finish computation of the variance; STDDEV additionally takes the square root.
+            valuesT[c] /= numRows;
+            if (opCode == AggOpCode::STDDEV)
+                valuesT[c] = sqrt(valuesT[c]);
+        }
+
+        delete[] nnzCol;
+
+        // TODO We could avoid copying by returning tmp and destroying res. But
+        // that might be wrong if res was not nullptr initially.
+        memcpy(valuesRes, valuesT, numCols * sizeof(VTRes));
+        DataObjectFactory::destroy<DenseMatrix<VTRes>>(tmp);
+
+    }
+};
+
 #endif //SRC_RUNTIME_LOCAL_KERNELS_AGGCOL_H
\ No newline at end of file
diff --git a/src/runtime/local/kernels/AggRow.h b/src/runtime/local/kernels/AggRow.h
index 856daaa84..ac1de0bdd 100644
--- a/src/runtime/local/kernels/AggRow.h
+++ b/src/runtime/local/kernels/AggRow.h
@@ -19,6 +19,7 @@
 
 #include <runtime/local/context/DaphneContext.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DataObjectFactory.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/kernels/AggAll.h>
@@ -244,5 +245,81 @@ struct AggRow<DenseMatrix<VTRes>, CSRMatrix<VTArg>> {
     }
 };
 
+// ----------------------------------------------------------------------------
+// DenseMatrix <- COOMatrix
+// ----------------------------------------------------------------------------
+
+template<typename VTRes, typename VTArg>
+struct AggRow<DenseMatrix<VTRes>, COOMatrix<VTArg>> {
+    // Row-wise aggregation of a COO matrix: writes one aggregate per row of arg
+    // into a (numRows x 1) dense matrix, which is created if res is nullptr.
+    static void apply(AggOpCode opCode, DenseMatrix<VTRes> *& res, const COOMatrix<VTArg> * arg, DCTX(ctx)) {
+        const size_t nRows = arg->getNumRows();
+        const size_t nCols = arg->getNumCols();
+
+        if(res == nullptr)
+            res = DataObjectFactory::create<DenseMatrix<VTRes>>(nRows, 1, false);
+
+        VTRes * out = res->getValues();
+
+        if(AggOpCodeUtils::isPureBinaryReduction(opCode)) {
+            // Fold the stored values of each row with the op-code's binary function;
+            // aggArray also gets the row length, sparse-safety flag and neutral element.
+            EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> reduce =
+                    getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(opCode));
+            const bool sparseSafe = AggOpCodeUtils::isSparseSafe(opCode);
+            const VTRes neutralElem = AggOpCodeUtils::template getNeutral<VTRes>(opCode);
+
+            for(size_t r = 0; r < nRows; r++, out += res->getRowSkip())
+                *out = AggAll<VTRes, COOMatrix<VTArg>>::aggArray(
+                        arg->getValues(r), arg->getNumNonZerosRow(r), nCols,
+                        reduce, sparseSafe, neutralElem, ctx
+                );
+            return;
+        }
+
+        // The op-code is either MEAN or STDDEV or VAR. All of them start from
+        // the row sum, computed with a sparse-safe SUM and neutral element 0.
+        const bool sparseSafe = true;
+        const VTRes zero = VTRes(0);
+        // Zero-initialized scratch vector for the squared deviations per row.
+        auto sqDev = DataObjectFactory::create<DenseMatrix<VTRes>>(nRows, 1, true);
+        VTRes * sqDevVals = sqDev->getValues();
+        EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> add =
+                getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(AggOpCode::SUM));
+
+        // Offset of the current row's first value from getValues(0); STDDEV/VAR only.
+        size_t rowStart = 0;
+        for(size_t r = 0; r < nRows; r++, out += res->getRowSkip()) {
+            *out = AggAll<VTRes, COOMatrix<VTArg>>::aggArray(
+                    arg->getValues(r), arg->getNumNonZerosRow(r), nCols,
+                    add, sparseSafe, zero, ctx
+            );
+            const VTArg * allVals = arg->getValues(0);
+            const size_t nnzRow = arg->getNumNonZerosRow(r);
+            // Mean of the row, counting the cells that are not stored.
+            *out = *out / nCols;
+            if(opCode == AggOpCode::MEAN)
+                continue;
+
+            // Squared deviations of the stored values from the mean ...
+            VTRes & acc = sqDevVals[r];
+            const size_t rowEnd = rowStart + nnzRow;
+            for(size_t i = rowStart; i < rowEnd; i++) {
+                VTRes dev = static_cast<VTRes>((allVals[i])) - (*out);
+                acc = acc + dev * dev;
+            }
+            rowStart = rowEnd;
+            // ... plus one squared mean per cell that is not stored.
+            acc += (nCols - nnzRow) * (*out)*(*out);
+            acc /= nCols;
+            if(opCode == AggOpCode::STDDEV)
+                *out = sqrt(acc);
+            else
+                *out = acc;
+        }
+        DataObjectFactory::destroy<DenseMatrix<VTRes>>(sqDev);
+    }
+};
 
 #endif //SRC_RUNTIME_LOCAL_KERNELS_AGGROW_H
diff --git a/src/runtime/local/kernels/CheckEq.h b/src/runtime/local/kernels/CheckEq.h
index 2ea170a08..1f6619b50 100644
--- a/src/runtime/local/kernels/CheckEq.h
+++ b/src/runtime/local/kernels/CheckEq.h
@@ -18,6 +18,7 @@
 
 #include <runtime/local/context/DaphneContext.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/datastructures/Frame.h>
 
@@ -154,6 +155,54 @@ struct CheckEq<CSRMatrix<VT>> {
     }
 };
 
+// ----------------------------------------------------------------------------
+// COOMatrix
+// ----------------------------------------------------------------------------
+
+template<typename VT>
+struct CheckEq<COOMatrix<VT>> {
+    // Returns true iff both COO matrices have the same shape and the same
+    // sequence of stored (row, column, value) entries.
+    static bool apply(const COOMatrix<VT> * lhs, const COOMatrix<VT> * rhs, DCTX(ctx)) {
+        // The very same object is trivially equal to itself.
+        if(lhs == rhs)
+            return true;
+
+        // Different shapes can never be equal.
+        if(lhs->getNumRows() != rhs->getNumRows())
+            return false;
+        if(lhs->getNumCols() != rhs->getNumCols())
+            return false;
+
+        // Neither can different numbers of stored entries.
+        const size_t numEntries = lhs->getNumNonZeros();
+        if(numEntries != rhs->getNumNonZeros())
+            return false;
+
+        const VT * valsL = lhs->getValues();
+        const VT * valsR = rhs->getValues();
+        const size_t * rowIdxsL = lhs->getRowIdxs();
+        const size_t * rowIdxsR = rhs->getRowIdxs();
+        const size_t * colIdxsL = lhs->getColIdxs();
+        const size_t * colIdxsR = rhs->getColIdxs();
+
+        // Row indexes are compared relative to each matrix's lower row.
+        const size_t rowBaseL = lhs->getLowerRow();
+        const size_t rowBaseR = rhs->getLowerRow();
+
+        // Entries are compared position by position: row, column, value.
+        for(size_t k = 0; k < numEntries; k++) {
+            if(rowIdxsL[k] - rowBaseL != rowIdxsR[k] - rowBaseR)
+                return false;
+            if(colIdxsL[k] != colIdxsR[k])
+                return false;
+            if(valsL[k] != valsR[k])
+                return false;
+        }
+        return true;
+    }
+};
+
 // ----------------------------------------------------------------------------
 // Frame
 // ----------------------------------------------------------------------------
diff --git a/src/runtime/local/kernels/CheckEqApprox.h b/src/runtime/local/kernels/CheckEqApprox.h
index e6cff4f6c..2c6957dd6 100644
--- a/src/runtime/local/kernels/CheckEqApprox.h
+++ b/src/runtime/local/kernels/CheckEqApprox.h
@@ -18,6 +18,7 @@
 
 #include <runtime/local/context/DaphneContext.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/datastructures/Frame.h>
 
@@ -160,6 +161,57 @@ struct CheckEqApprox<CSRMatrix<VT>> {
     }
 };
 
+// ----------------------------------------------------------------------------
+// COOMatrix
+// ----------------------------------------------------------------------------
+
+template<typename VT>
+struct CheckEqApprox<COOMatrix<VT>> {
+    /**
+     * @brief Checks two COO matrices for approximate equality.
+     * @param lhs, rhs The matrices to compare.
+     * @param eps Maximum tolerated absolute difference of two values.
+     * @return true if the shapes and the stored coordinates are identical
+     * and no pair of values at the same position differs by more than eps.
+     */
+    static bool apply(const COOMatrix<VT> * lhs, const COOMatrix<VT> * rhs, double eps, DCTX(ctx)) {
+        if(lhs == rhs)
+            return true;
+
+        if(lhs->getNumRows() != rhs->getNumRows() || lhs->getNumCols() != rhs->getNumCols())
+            return false;
+
+        const size_t nnz = lhs->getNumNonZeros();
+        if(nnz != rhs->getNumNonZeros())
+            return false;
+
+        const VT * valuesLhs = lhs->getValues();
+        const size_t * rowsLhs = lhs->getRowIdxs();
+        const size_t * colsLhs = lhs->getColIdxs();
+
+        const VT * valuesRhs = rhs->getValues();
+        const size_t * rowsRhs = rhs->getRowIdxs();
+        const size_t * colsRhs = rhs->getColIdxs();
+
+        // Row indexes are compared relative to each matrix's lower row.
+        const size_t lowerRowLhs = lhs->getLowerRow();
+        const size_t lowerRowRhs = rhs->getLowerRow();
+
+        for (size_t i = 0; i < nnz; i++) {
+            if (rowsLhs[i] - lowerRowLhs != rowsRhs[i] - lowerRowRhs) return false;
+            if (colsLhs[i] != colsRhs[i]) return false;
+            const VT valLhs = valuesLhs[i];
+            const VT valRhs = valuesRhs[i];
+            // Subtract the smaller from the larger value: for unsigned VT,
+            // negating a wrapped-around difference would not yield |lhs - rhs|.
+            const auto diff = valLhs > valRhs ? valLhs - valRhs : valRhs - valLhs;
+            if (diff > eps)
+                return false;
+        }
+        return true;
+    }
+};
+
 // ----------------------------------------------------------------------------
 // Frame
 // ----------------------------------------------------------------------------
diff --git a/src/runtime/local/kernels/EwBinaryMat.h b/src/runtime/local/kernels/EwBinaryMat.h
index c6158313e..1accde80d 100644
--- a/src/runtime/local/kernels/EwBinaryMat.h
+++ b/src/runtime/local/kernels/EwBinaryMat.h
@@ -18,6 +18,7 @@
 
 #include <runtime/local/context/DaphneContext.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DataObjectFactory.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/datastructures/Matrix.h>
@@ -248,6 +249,159 @@ struct EwBinaryMat<CSRMatrix<VT>, CSRMatrix<VT>, CSRMatrix<VT>> {
     }
 };
 
+// ----------------------------------------------------------------------------
+// COOMatrix <- COOMatrix, COOMatrix
+// ----------------------------------------------------------------------------
+
+template<typename VT>
+struct EwBinaryMat<COOMatrix<VT>, COOMatrix<VT>, COOMatrix<VT>> { // element-wise ADD/MUL of two COO matrices
+    static void apply(BinaryOpCode opCode, COOMatrix<VT> *& res, const COOMatrix<VT> * lhs, const COOMatrix<VT> * rhs, DCTX(ctx)) {
+        const size_t numRows = lhs->getNumRows();
+        const size_t numCols = lhs->getNumCols();
+        if( numRows != rhs->getNumRows() || numCols != rhs->getNumCols() )
+            throw std::runtime_error("EwBinaryMat(COO) - lhs and rhs must have the same dimensions.");
+        // Upper bound of the number of non-zeros in the result (only ADD and MUL are supported).
+        size_t maxNnz;
+        switch(opCode) {
+            case BinaryOpCode::ADD: // merge
+                maxNnz = lhs->getNumNonZeros() + rhs->getNumNonZeros();
+                break;
+            case BinaryOpCode::MUL: // intersect
+                maxNnz = std::min(lhs->getNumNonZeros(), rhs->getNumNonZeros());
+                break;
+            default:
+                throw std::runtime_error("EwBinaryMat(COO) - unknown BinaryOpCode");
+        }
+        // Allocate the result unless the caller provided one.
+        if(res == nullptr)
+            res = DataObjectFactory::create<COOMatrix<VT>>(numRows, numCols, maxNnz, false);
+
+        EwBinaryScaFuncPtr<VT, VT, VT> func = getEwBinaryScaFuncPtr<VT, VT, VT>(opCode);
+        // Both operations process the matrices row by row.
+        switch(opCode) {
+            case BinaryOpCode::ADD: { // merge non-zero cells
+                for(size_t rowIdx = 0; rowIdx < numRows; rowIdx++) {
+                    size_t nnzRowLhs = lhs->getNumNonZerosRow(rowIdx);
+                    size_t nnzRowRhs = rhs->getNumNonZerosRow(rowIdx);
+                    if(nnzRowLhs && nnzRowRhs) {
+                        // merge within row (two cursors advancing over the column indexes of both rows)
+                        const VT * valuesRowLhs = lhs->getValues(rowIdx);
+                        const VT * valuesRowRhs = rhs->getValues(rowIdx);
+                        VT * valuesRowRes = res->getValues(rowIdx);
+                        const size_t * colIdxsRowLhs = lhs->getColIdxs(rowIdx);
+                        const size_t * colIdxsRowRhs = rhs->getColIdxs(rowIdx);
+                        const size_t *rowIdxsRowArg1 = lhs->getRowIdxs(rowIdx);
+                        const size_t *rowIdxsRowArg2 = rhs->getRowIdxs(rowIdx);
+                        size_t * colIdxsRowRes = res->getColIdxs(rowIdx);
+                        size_t * rowIdxsRowRes = res->getRowIdxs(rowIdx);
+                        size_t posLhs = 0;
+                        size_t posRhs = 0;
+                        size_t posRes = 0;
+                        while(posLhs < nnzRowLhs && posRhs < nnzRowRhs) {
+                            if(colIdxsRowLhs[posLhs] == colIdxsRowRhs[posRhs]) {
+                                valuesRowRes[posRes] = func(valuesRowLhs[posLhs], valuesRowRhs[posRhs], ctx);
+                                colIdxsRowRes[posRes] = colIdxsRowLhs[posLhs];
+                                posLhs++;
+                                posRhs++;
+                            }
+                            else if(colIdxsRowLhs[posLhs] < colIdxsRowRhs[posRhs]) {
+                                valuesRowRes[posRes] = valuesRowLhs[posLhs];
+                                colIdxsRowRes[posRes] = colIdxsRowLhs[posLhs];
+                                posLhs++;
+                            }
+                            else {
+                                valuesRowRes[posRes] = valuesRowRhs[posRhs];
+                                colIdxsRowRes[posRes] = colIdxsRowRhs[posRhs];
+                                posRhs++;
+                            }
+                            rowIdxsRowRes[posRes] = rowIdx;
+                            posRes++;
+                        }
+                        // copy the rest from left (restRowLhs is 0 if lhs was exhausted by the loop above)
+                        const size_t restRowLhs = nnzRowLhs - posLhs;
+                        memcpy(valuesRowRes + posRes, valuesRowLhs + posLhs, restRowLhs * sizeof(VT));
+                        memcpy(colIdxsRowRes + posRes, colIdxsRowLhs + posLhs, restRowLhs * sizeof(size_t));
+                        memcpy(rowIdxsRowRes + posRes, rowIdxsRowArg1 + posLhs, restRowLhs * sizeof(size_t));
+                        // copy the rest from right (restRowRhs is 0 if rhs was exhausted by the loop above)
+                        const size_t restRowRhs = nnzRowRhs - posRhs;
+                        memcpy(valuesRowRes + posRes, valuesRowRhs + posRhs, restRowRhs * sizeof(VT));
+                        memcpy(colIdxsRowRes + posRes, colIdxsRowRhs + posRhs, restRowRhs * sizeof(size_t));
+                        memcpy(rowIdxsRowRes + posRes, rowIdxsRowArg2 + posRhs, restRowRhs * sizeof(size_t));
+                        // write a terminator behind the row's entries: value 0, indexes size_t(-1)
+                        valuesRowRes[posRes + restRowLhs + restRowRhs] = VT(0);
+                        colIdxsRowRes[posRes + restRowLhs + restRowRhs] = size_t(-1);
+                        rowIdxsRowRes[posRes + restRowLhs + restRowRhs] = size_t(-1);
+                    }
+                    else if(nnzRowLhs) {
+                        // copy from left
+                        VT * valuesRowRes = res->getValues(rowIdx);
+                        size_t * colIdxsRowRes = res->getColIdxs(rowIdx);
+                        size_t * rowIdxsRowRes = res->getRowIdxs(rowIdx);
+                        memcpy(valuesRowRes, lhs->getValues(rowIdx), nnzRowLhs * sizeof(VT));
+                        memcpy(colIdxsRowRes, lhs->getColIdxs(rowIdx), nnzRowLhs * sizeof(size_t));
+                        memcpy(rowIdxsRowRes, lhs->getRowIdxs(rowIdx), nnzRowLhs * sizeof(size_t));
+                        valuesRowRes[nnzRowLhs] = VT(0);
+                        colIdxsRowRes[nnzRowLhs] = size_t(-1);
+                        rowIdxsRowRes[nnzRowLhs] = size_t(-1);
+                    }
+                    else if(nnzRowRhs) {
+                        // copy from right
+                        VT * valuesRowRes = res->getValues(rowIdx);
+                        size_t * colIdxsRowRes = res->getColIdxs(rowIdx);
+                        size_t * rowIdxsRowRes = res->getRowIdxs(rowIdx);
+                        memcpy(valuesRowRes, rhs->getValues(rowIdx), nnzRowRhs * sizeof(VT));
+                        memcpy(colIdxsRowRes, rhs->getColIdxs(rowIdx), nnzRowRhs * sizeof(size_t));
+                        memcpy(rowIdxsRowRes, rhs->getRowIdxs(rowIdx), nnzRowRhs * sizeof(size_t));
+                        valuesRowRes[nnzRowRhs] = VT(0);
+                        colIdxsRowRes[nnzRowRhs] = size_t(-1);
+                        rowIdxsRowRes[nnzRowRhs] = size_t(-1);
+                    }
+                }
+                break;
+            }
+            case BinaryOpCode::MUL: { // intersect non-zero cells
+                for(size_t rowIdx = 0; rowIdx < numRows; rowIdx++) {
+                    size_t nnzRowLhs = lhs->getNumNonZerosRow(rowIdx);
+                    size_t nnzRowRhs = rhs->getNumNonZerosRow(rowIdx);
+                    if(nnzRowLhs && nnzRowRhs) {
+                        // intersect within row (only columns present in both rows produce an entry)
+                        const VT * valuesRowLhs = lhs->getValues(rowIdx);
+                        const VT * valuesRowRhs = rhs->getValues(rowIdx);
+                        VT * valuesRowRes = res->getValues(rowIdx);
+                        const size_t * colIdxsRowLhs = lhs->getColIdxs(rowIdx);
+                        const size_t * colIdxsRowRhs = rhs->getColIdxs(rowIdx);
+                        size_t * colIdxsRowRes = res->getColIdxs(rowIdx);
+                        size_t * rowIdxsRowRes = res->getRowIdxs(rowIdx);
+                        size_t posLhs = 0;
+                        size_t posRhs = 0;
+                        size_t posRes = 0;
+                        while(posLhs < nnzRowLhs && posRhs < nnzRowRhs) {
+                            if(colIdxsRowLhs[posLhs] == colIdxsRowRhs[posRhs]) {
+                                valuesRowRes[posRes] = func(valuesRowLhs[posLhs], valuesRowRhs[posRhs], ctx);
+                                colIdxsRowRes[posRes] = colIdxsRowLhs[posLhs];
+                                posLhs++;
+                                posRhs++;
+                                rowIdxsRowRes[posRes] = rowIdx;
+                                posRes++;
+                            }
+                            else if(colIdxsRowLhs[posLhs] < colIdxsRowRhs[posRhs])
+                                posLhs++;
+                            else
+                                posRhs++;
+                        }
+                        valuesRowRes[posRes] = VT(0);
+                        colIdxsRowRes[posRes] = size_t(-1);
+                        rowIdxsRowRes[posRes] = size_t(-1);
+                    }
+                }
+                break;
+            }
+            default:
+                throw std::runtime_error("EwBinaryMat(COO) - unknown BinaryOpCode");
+        }
+    }
+};
+
 // ----------------------------------------------------------------------------
 // CSRMatrix <- CSRMatrix, DenseMatrix
 // ----------------------------------------------------------------------------
@@ -317,6 +471,72 @@ struct EwBinaryMat<CSRMatrix<VT>, CSRMatrix<VT>, DenseMatrix<VT>> {
     }
 };
 
+// ----------------------------------------------------------------------------
+// COOMatrix <- COOMatrix, DenseMatrix
+// ----------------------------------------------------------------------------
+
+template<typename VT>
+struct EwBinaryMat<COOMatrix<VT>, COOMatrix<VT>, DenseMatrix<VT>> {
+    static void apply(BinaryOpCode opCode, COOMatrix<VT> *& res, const COOMatrix<VT> * lhs, const DenseMatrix<VT> * rhs, DCTX(ctx)) {
+        const size_t numRows = lhs->getNumRows();
+        const size_t numCols = lhs->getNumCols();
+        if( (numRows != rhs->getNumRows() &&  rhs->getNumRows() != 1)
+            || (numCols != rhs->getNumCols() && rhs->getNumCols() != 1 ) )
+            throw std::runtime_error("EwBinaryMat(COO) - lhs and rhs must have the same dimensions (or broadcast)");
+        // Upper bound of the number of non-zeros in the result (only MUL is supported).
+        size_t maxNnz;
+        switch(opCode) {
+            case BinaryOpCode::MUL: // intersect
+                maxNnz = lhs->getNumNonZeros();
+                break;
+            default:
+                throw std::runtime_error("EwBinaryMat(COO) - unknown BinaryOpCode");
+        }
+        // Allocate the result unless the caller provided one.
+        if(res == nullptr)
+            res = DataObjectFactory::create<COOMatrix<VT>>(numRows, numCols, maxNnz, false);
+
+
+        EwBinaryScaFuncPtr<VT, VT, VT> func = getEwBinaryScaFuncPtr<VT, VT, VT>(opCode);
+
+
+        switch(opCode) {
+            case BinaryOpCode::MUL: { // intersect non-zero cells
+                for(size_t rowIdx = 0; rowIdx < numRows; rowIdx++) {
+                    size_t nnzRowLhs = lhs->getNumNonZerosRow(rowIdx);
+                    if(nnzRowLhs) {
+                        // intersect within row: visit the stored entries of lhs and look up the matching rhs cell
+                        const VT * valuesRowLhs = lhs->getValues(rowIdx);
+                        VT * valuesRowRes = res->getValues(rowIdx);
+                        const size_t * colIdxsRowLhs = lhs->getColIdxs(rowIdx);
+                        size_t * colIdxsRowRes = res->getColIdxs(rowIdx);
+                        const size_t * rowIdxsRowLhs = lhs->getRowIdxs(rowIdx);
+                        size_t * rowIdxsRowRes = res->getRowIdxs(rowIdx);
+                        auto rhsRow = (rhs->getNumRows() == 1 ? 0 : rowIdx); // a single-row rhs is broadcast to all rows
+                        size_t posRes = 0;
+                        for (size_t posLhs = 0; posLhs < nnzRowLhs; ++posLhs) {
+                            auto rhsCol = (rhs->getNumCols() == 1 ? 0 : colIdxsRowLhs[posLhs]); // a single-column rhs is broadcast to all columns
+                            auto rVal = rhs->get(rhsRow, rhsCol);
+                            if(rVal != 0) { // entries whose rhs cell is zero are not stored in the result
+                                valuesRowRes[posRes] = func(valuesRowLhs[posLhs], rVal, ctx);
+                                colIdxsRowRes[posRes] = colIdxsRowLhs[posLhs];
+                                rowIdxsRowRes[posRes] = rowIdxsRowLhs[posLhs];
+                                posRes++;
+                            }
+                        }
+                        valuesRowRes[posRes] = VT(0); // terminator behind the row's entries: value 0, indexes size_t(-1)
+                        colIdxsRowRes[posRes] = size_t(-1);
+                        rowIdxsRowRes[posRes] = size_t(-1);
+                    }
+                }
+                break;
+            }
+            default:
+                throw std::runtime_error("EwBinaryMat(COO) - unknown BinaryOpCode");
+        }
+    }
+};
+
 // ----------------------------------------------------------------------------
 // Matrix <- Matrix, Matrix
 // ----------------------------------------------------------------------------
diff --git a/src/runtime/local/kernels/EwUnaryMat.h b/src/runtime/local/kernels/EwUnaryMat.h
index 6af5440cb..3ca34622e 100644
--- a/src/runtime/local/kernels/EwUnaryMat.h
+++ b/src/runtime/local/kernels/EwUnaryMat.h
@@ -20,6 +20,7 @@
 #include <runtime/local/context/DaphneContext.h>
 #include <runtime/local/datastructures/DataObjectFactory.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/kernels/UnaryOpCode.h>
 #include <runtime/local/kernels/EwUnarySca.h>
 
@@ -75,4 +76,45 @@ struct EwUnaryMat<DenseMatrix<VT>, DenseMatrix<VT>> {
     }
 };
 
+// ----------------------------------------------------------------------------
+// DenseMatrix <- COOMatrix
+// ----------------------------------------------------------------------------
+
+template<typename VT>
+struct EwUnaryMat<DenseMatrix<VT>, COOMatrix<VT>> {
+    // Applies the unary operation to every cell of a COO matrix, stored or
+    // not, and writes the results to a dense matrix of the same shape. The
+    // dense result is created here if res is nullptr. Cells without a stored
+    // entry are evaluated as func(0).
+    static void apply(UnaryOpCode opCode, DenseMatrix<VT> *& res, const COOMatrix<VT> * arg, DCTX(ctx)) {
+        const size_t nRows = arg->getNumRows();
+        const size_t nCols = arg->getNumCols();
+
+        if(res == nullptr)
+            res = DataObjectFactory::create<DenseMatrix<VT>>(nRows, nCols, false);
+
+        EwUnaryScaFuncPtr<VT, VT> op = getEwUnaryScaFuncPtr<VT, VT>(opCode);
+
+        const size_t * entryRows = arg->getRowIdxs();
+        const size_t * entryCols = arg->getColIdxs();
+        const VT * entryVals = arg->getValues();
+
+        // Position of the next stored entry that has not been consumed yet.
+        // NOTE(review): the index arrays are still read at this position after the
+        // last entry was consumed; presumably a terminator sits there - confirm.
+        size_t next = 0;
+        VT * outRow = res->getValues();
+        for(size_t r = 0; r < nRows; r++, outRow += res->getRowSkip()) {
+            for(size_t c = 0; c < nCols; c++) {
+                // A cell consumes the next stored entry only if the
+                // coordinates match; any other cell is evaluated as zero.
+                if(entryRows[next] == r && entryCols[next] == c)
+                    outRow[c] = op(entryVals[next++], ctx);
+                else
+                    outRow[c] = op(VT(0), ctx);
+            }
+        }
+    }
+};
+
 #endif //SRC_RUNTIME_LOCAL_KERNELS_EWUNARYMAT_H
\ No newline at end of file
diff --git a/src/runtime/local/kernels/RandMatrix.h b/src/runtime/local/kernels/RandMatrix.h
index 1c63b8011..0782c91db 100644
--- a/src/runtime/local/kernels/RandMatrix.h
+++ b/src/runtime/local/kernels/RandMatrix.h
@@ -18,6 +18,7 @@
 
 #include <runtime/local/context/DaphneContext.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DataObjectFactory.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 
@@ -25,6 +26,8 @@
 #include <random>
 #include <set>
 #include <type_traits>
+#include <vector>
+#include <unordered_set>
 
 #include <cassert>
 #include <cmath>
@@ -254,3 +257,101 @@ struct RandMatrix<CSRMatrix<VT>, VT> {
             rowOffsetsRes[i] += rowOffsetsRes[i - 1];
     }
 };
+
+// ----------------------------------------------------------------------------
+// COOMatrix
+// ----------------------------------------------------------------------------
+
+template<typename VT>
+struct RandMatrix<COOMatrix<VT>, VT> {
+    static void apply(COOMatrix<VT> *& res, size_t numRows, size_t numCols, VT min, VT max, double sparsity, int64_t seed, DCTX(ctx)) {
+        assert(numRows > 0 && "numRows must be > 0");
+        assert(numCols > 0 && "numCols must be > 0");
+        assert(min <= max && "min must be <= max");
+        assert(sparsity >= 0.0 && sparsity <= 1.0 &&
+               "sparsity has to be in the interval [0.0, 1.0]");
+
+        // The exact number of non-zeros to generate.
+        // TODO Ideally, it should not be allowed that zero is included in [min, max].
+        const auto nnz = static_cast<size_t>(round(numRows * numCols * sparsity));
+
+        if (res == nullptr)
+            res = DataObjectFactory::create<COOMatrix<VT>>(numRows, numCols, nnz, false);
+
+        // Initialize pseudo random number generators.
+        if (seed == -1)
+            seed = std::chrono::high_resolution_clock::now().time_since_epoch().count();
+        std::default_random_engine gen(seed);
+        // Distribution of the values: real for floating-point VT, integer otherwise.
+        static_assert(
+                std::is_floating_point<VT>::value || std::is_integral<VT>::value,
+                "the value type must be either floating point or integral"
+        );
+        typename std::conditional<
+                std::is_floating_point<VT>::value,
+                std::uniform_real_distribution<VT>,
+                std::uniform_int_distribution<VT>
+        >::type distrVal(min, max);
+        // Draw the nnz values.
+        VT *valuesRes = res->getValues();
+        for (size_t i = 0; i < nnz; i++)
+            valuesRes[i] = distrVal(gen);
+        // Draw a row for each non-zero; a row that already holds numCols entries is redrawn.
+        std::uniform_int_distribution<size_t> distrRow(0, numRows - 1);
+        std::vector<size_t> rowSequence;
+        std::vector<size_t> occurrences(numRows, 0);
+        for (size_t i = 0; i < nnz; ++i) {
+            size_t randomValue = distrRow(gen);
+
+            while (occurrences[randomValue] >= numCols) {
+                randomValue = distrRow(gen);
+            }
+
+            occurrences[randomValue]++;
+            rowSequence.push_back(randomValue);
+        }
+        std::sort(rowSequence.begin(), rowSequence.end());
+
+        std::vector<size_t> colSequence;
+        // startRow[i] is the position in rowSequence at which the i-th distinct row begins.
+        std::vector<size_t> startRow;
+        size_t lastRow = -1; // i.e., size_t(-1): no row seen yet
+        for (size_t i = 0; i < nnz; ++i) {
+            if (rowSequence[i] != lastRow) startRow.push_back(i);
+            lastRow = rowSequence[i];
+        }
+        startRow.push_back(nnz);
+
+        std::uniform_int_distribution<size_t> distrCol(0, numCols - 1);
+        // For each distinct row, draw as many distinct column indexes as the row has entries.
+        for (size_t i = 0; i < startRow.size() - 1; i++) {
+            size_t start = startRow[i];
+            size_t end = startRow[i + 1];
+            std::vector<size_t> subSequence;
+            std::unordered_set<size_t> uniqueValues;
+            while (subSequence.size() < end - start) {
+                size_t randomValue = distrCol(gen);
+                if (uniqueValues.find(randomValue) == uniqueValues.end()) {
+                    subSequence.push_back(randomValue);
+                    uniqueValues.insert(randomValue);
+                }
+            }
+
+            std::sort(subSequence.begin(), subSequence.end());
+            for (size_t value : subSequence) {
+                colSequence.push_back(value);
+            }
+        }
+        // Copy the generated coordinates to the result.
+        size_t * rowIdxs = res->getRowIdxs();
+        size_t * colIdxs = res->getColIdxs();
+        for (size_t i = 0; i < nnz; i++) {
+            rowIdxs[i] = rowSequence[i];
+            colIdxs[i] = colSequence[i];
+        }
+        // Terminate the entry lists: value 0, indexes size_t(-1).
+        valuesRes[nnz] = VT(0);
+        rowIdxs[nnz] = size_t(-1);
+        colIdxs[nnz] = size_t(-1);
+    }
+};
diff --git a/src/runtime/local/kernels/Transpose.h b/src/runtime/local/kernels/Transpose.h
index 1247c3138..eb946c5ee 100644
--- a/src/runtime/local/kernels/Transpose.h
+++ b/src/runtime/local/kernels/Transpose.h
@@ -18,6 +18,7 @@
 
 #include <runtime/local/context/DaphneContext.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DataObjectFactory.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 
@@ -118,3 +119,49 @@ struct Transpose<CSRMatrix<VT>, CSRMatrix<VT>> {
         delete[] curRowOffsets;
     }
 };
+
+// ----------------------------------------------------------------------------
+// COOMatrix <- COOMatrix
+// ----------------------------------------------------------------------------
+
+template<typename VT>
+struct Transpose<COOMatrix<VT>, COOMatrix<VT>> {
+    // Transposes arg into res; res is allocated here if it is nullptr.
+    static void apply(COOMatrix<VT> *& res, const COOMatrix<VT> * arg, DCTX(ctx)) {
+        const size_t numRows = arg->getNumRows();
+        const size_t numCols = arg->getNumCols();
+        if(res == nullptr)
+            res = DataObjectFactory::create<COOMatrix<VT>>(numCols, numRows, arg->getMaxNumNonZeros(), false);
+
+        VT * valuesRes = res->getValues();
+        size_t * colIdxsRes = res->getColIdxs();
+        size_t * rowIdxsRes = res->getRowIdxs();
+
+        const VT * valuesArg = arg->getValues();
+        const size_t * colIdxsArg = arg->getColIdxs();
+        const size_t * rowIdxsArg = arg->getRowIdxs();
+
+        // Collect (column index, position) of each entry up to the size_t(-1) terminator.
+        std::vector<std::pair<size_t, size_t>> result;
+        for (size_t i = 0; colIdxsArg[i] != size_t(-1); ++i)
+            result.emplace_back(colIdxsArg[i], i);
+        const size_t size = result.size();
+
+        // Order the entries by their new row (the old column). Ties are broken
+        // by the original position (lexicographic pair comparison), so entries
+        // sharing a column keep their original relative order; std::sort with
+        // a column-only comparator would leave that order unspecified.
+        std::sort(result.begin(), result.end());
+
+        for (size_t i = 0; i < size; ++i) {
+            valuesRes[i] = valuesArg[result[i].second];
+            colIdxsRes[i] = rowIdxsArg[result[i].second];
+            rowIdxsRes[i] = result[i].first;
+        }
+
+        // Terminate the entry lists of res: value 0, indexes size_t(-1).
+        valuesRes[size] = VT(0);
+        colIdxsRes[size] = size_t(-1);
+        rowIdxsRes[size] = size_t(-1);
+    }
+};
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 990339a7d..6cf1fe3de 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -65,6 +65,7 @@ set(TEST_SOURCES
         runtime/distributed/worker/WorkerTest.cpp
 
         runtime/local/datastructures/CSRMatrixTest.cpp
+        runtime/local/datastructures/COOMatrixTest.cpp
         runtime/local/datastructures/DenseMatrixTest.cpp
         runtime/local/datastructures/FrameTest.cpp
         runtime/local/datastructures/MatrixTest.cpp
diff --git a/test/runtime/local/datastructures/COOMatrixTest.cpp b/test/runtime/local/datastructures/COOMatrixTest.cpp
new file mode 100644
index 000000000..578b8edc5
--- /dev/null
+++ b/test/runtime/local/datastructures/COOMatrixTest.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2021 The DAPHNE Consortium
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <runtime/local/datastructures/COOMatrix.h>
+#include <runtime/local/datastructures/DataObjectFactory.h>
+#include <runtime/local/datastructures/ValueTypeUtils.h>
+
+#include <tags.h>
+
+#include <catch.hpp>
+
+#include <cstdint>
+
+TEMPLATE_TEST_CASE("COOMatrix allocates enough space", TAG_DATASTRUCTURES, ALL_VALUE_TYPES) {
+    // No assertions in this test case. We just want to see if it runs without
+    // crashing.
+
+    using ValueType = TestType;
+
+    const size_t numRows = 10000;
+    const size_t numCols = 2000;
+    const size_t numNonZeros = 500;
+
+    COOMatrix<ValueType> * m = DataObjectFactory::create<COOMatrix<ValueType>>(numRows, numCols, numNonZeros, false);
+
+    ValueType * values = m->getValues();
+    size_t * colIdxs = m->getColIdxs();
+    size_t * rowIdxs = m->getRowIdxs();
+
+    // Fill all arrays with ones of the respective type. Note that this does
+    // not result in a valid COO representation, but we only want to check if
+    // there is enough space.
+    for(size_t i = 0; i < numNonZeros; i++) {
+        values[i] = ValueType(1);
+        colIdxs[i] = size_t(1);
+        rowIdxs[i] = size_t(1);
+    }
+
+    DataObjectFactory::destroy(m);
+}
+
+TEST_CASE("COOMatrix methods work properly", TAG_DATASTRUCTURES) {
+    using ValueType = uint64_t;
+
+    const size_t numRows = 10;
+    const size_t numCols = 10;
+    const size_t maxnumNonZeros = 6;
+
+    COOMatrix<ValueType> * matrix = DataObjectFactory::create<COOMatrix<ValueType>>(numRows, numCols, maxnumNonZeros, true);
+
+    matrix->set(0, 0, 5);
+    matrix->set(2, 2, 3);
+    matrix->set(1, 1, 4);
+    matrix->set(3, 3, 2);
+    matrix->set(4, 4, 1);
+
+    CHECK(matrix->getMaxNumNonZeros() == 6);
+    CHECK(matrix->getNumNonZeros() == 5);
+    CHECK(matrix->getNumRows() == 10);
+    CHECK(matrix->getNumCols() == 10);
+    CHECK(matrix->getNumNonZerosRow(1) == 1);
+    CHECK(matrix->getNumNonZerosCol(1) == 1);
+    CHECK(matrix->getValues()[0] == 5);
+    CHECK(matrix->getColIdxs()[3] == 3);
+    CHECK(matrix->getRowIdxs()[2] == 2);
+    CHECK(matrix->getValues(1)[0] == 4);
+    CHECK(matrix->getColIdxs(1)[0] == 1);
+    CHECK(matrix->get(1, 1) == 4);
+
+    matrix->prepareAppend();
+    matrix->append(0, 0, 5);
+    matrix->append(1, 1, 4);
+    matrix->append(2, 2, 3);
+    matrix->append(3, 3, 2);
+    matrix->append(4, 4, 1);
+
+    CHECK(matrix->getNumNonZeros() == 5);
+    CHECK(matrix->getNumNonZerosRow(1) == 1);
+    CHECK(matrix->getNumNonZerosCol(1) == 1);
+    CHECK(matrix->getValues()[0] == 5);
+    CHECK(matrix->getColIdxs()[3] == 3);
+    CHECK(matrix->getRowIdxs()[2] == 2);
+    CHECK(matrix->getValues(1)[0] == 4);
+    CHECK(matrix->getColIdxs(1)[0] == 1);
+    CHECK(matrix->get(1, 1) == 4);
+}
+
+TEST_CASE("COOMatrix sub-matrix works properly", TAG_DATASTRUCTURES) {
+    using ValueType = uint64_t;
+
+    const size_t numRowsOrig = 10;
+    const size_t numColsOrig = 10;
+    const size_t maxnumNonZeros = 10;
+
+    COOMatrix<ValueType> * mOrig = DataObjectFactory::create<COOMatrix<ValueType>>(numRowsOrig, numColsOrig, maxnumNonZeros, true);
+    COOMatrix<ValueType> * mSub = DataObjectFactory::create<COOMatrix<ValueType>>(mOrig, 3, 5);
+
+    mOrig->set(0, 0, 5);
+    mOrig->set(2, 2, 3);
+    mOrig->set(1, 1, 4);
+    mOrig->set(3, 3, 2);
+    mOrig->set(4, 4, 1);
+
+    // Sub-matrix dimensions are as expected.
+    CHECK(mSub->getNumRows() == 2);
+    CHECK(mSub->getNumCols() == numColsOrig);
+
+    // Sub-matrix shares arrays with original.
+    CHECK(mSub->getValues()[0] == mOrig->getValues()[3]);
+    CHECK(mSub->getColIdxs()[0] == mOrig->getColIdxs()[3]);
+    CHECK(mSub->getRowIdxs()[0] == mOrig->getRowIdxs()[3]);
+
+    CHECK(mOrig->get(3, 3) == mSub->get(0, 3));
+    CHECK(mOrig->get(4, 4) == mSub->get(1, 4));
+
+    mOrig->set(3, 4, 15);
+
+    CHECK(mSub->getNumNonZeros() == 3);
+
+    mOrig->set(0, 1, 15);
+
+    CHECK(mSub->getNumNonZeros() == 3);
+
+    CHECK(mOrig->get(3, 3) == mSub->get(0, 3));
+    CHECK(mOrig->get(4, 4) == mSub->get(1, 4));
+
+    // Freeing both matrices does not result in double-free errors.
+    SECTION("Freeing the original matrix first is fine") {
+        DataObjectFactory::destroy(mOrig);
+        DataObjectFactory::destroy(mSub);
+    }
+    SECTION("Freeing the sub-matrix first is fine") {
+        DataObjectFactory::destroy(mSub);
+        DataObjectFactory::destroy(mOrig);
+    }
+}
\ No newline at end of file
diff --git a/test/runtime/local/kernels/AggAllTest.cpp b/test/runtime/local/kernels/AggAllTest.cpp
index c12c98d4b..c8c039a08 100644
--- a/test/runtime/local/kernels/AggAllTest.cpp
+++ b/test/runtime/local/kernels/AggAllTest.cpp
@@ -16,6 +16,7 @@
 
 #include <runtime/local/datagen/GenGivenVals.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/kernels/CheckEq.h>
 #include <runtime/local/kernels/AggAll.h>
@@ -28,7 +29,7 @@
 #include <vector>
 
 #define TEST_NAME(opName) "AggAll (" opName ")"
-#define DATA_TYPES DenseMatrix, CSRMatrix
+#define DATA_TYPES DenseMatrix, CSRMatrix, COOMatrix
 #define VALUE_TYPES double, uint32_t
 
 template<typename VTRes, class DTArg>
diff --git a/test/runtime/local/kernels/AggColTest.cpp b/test/runtime/local/kernels/AggColTest.cpp
index 708d50fef..006898ebd 100644
--- a/test/runtime/local/kernels/AggColTest.cpp
+++ b/test/runtime/local/kernels/AggColTest.cpp
@@ -16,6 +16,7 @@
 
 #include <runtime/local/datagen/GenGivenVals.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/kernels/CheckEqApprox.h>
 #include <runtime/local/kernels/AggCol.h>
@@ -28,7 +29,7 @@
 #include <vector>
 
 #define TEST_NAME(opName) "AggCol (" opName ")"
-#define DATA_TYPES DenseMatrix, CSRMatrix
+#define DATA_TYPES DenseMatrix, CSRMatrix, COOMatrix
 #define VALUE_TYPES double, uint32_t
 
 template<class DTRes, class DTArg>
diff --git a/test/runtime/local/kernels/AggRowTest.cpp b/test/runtime/local/kernels/AggRowTest.cpp
index b0d3c8862..603d71a3e 100644
--- a/test/runtime/local/kernels/AggRowTest.cpp
+++ b/test/runtime/local/kernels/AggRowTest.cpp
@@ -16,6 +16,7 @@
 
 #include <runtime/local/datagen/GenGivenVals.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/kernels/CheckEqApprox.h>
 #include <runtime/local/kernels/AggRow.h>
@@ -28,7 +29,7 @@
 #include <vector>
 
 #define TEST_NAME(opName) "AggRow (" opName ")"
-#define DATA_TYPES DenseMatrix, CSRMatrix
+#define DATA_TYPES DenseMatrix, CSRMatrix, COOMatrix
 #define VALUE_TYPES double, uint32_t
 
 template<class DTRes, class DTArg>
diff --git a/test/runtime/local/kernels/CheckEqApproxTest.cpp b/test/runtime/local/kernels/CheckEqApproxTest.cpp
index 5efe82701..20f0b87ae 100644
--- a/test/runtime/local/kernels/CheckEqApproxTest.cpp
+++ b/test/runtime/local/kernels/CheckEqApproxTest.cpp
@@ -18,6 +18,7 @@
 #include <runtime/local/datastructures/DataObjectFactory.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/kernels/CheckEqApprox.h>
 
 #include <tags.h>
@@ -31,7 +32,7 @@
 // TODO Extend tests to integral value types, they should be handled
 // gracefully, too.
 
-TEMPLATE_PRODUCT_TEST_CASE("CheckEqApprox, original matrices", TAG_KERNELS, (DenseMatrix, CSRMatrix), (float, double)) {
+TEMPLATE_PRODUCT_TEST_CASE("CheckEqApprox, original matrices", TAG_KERNELS, (DenseMatrix, CSRMatrix, COOMatrix), (float, double)) {
     using DT = TestType;
     
     std::vector<typename DT::VT> vals = {
diff --git a/test/runtime/local/kernels/CheckEqTest.cpp b/test/runtime/local/kernels/CheckEqTest.cpp
index bd3d0dc1f..02830a88c 100644
--- a/test/runtime/local/kernels/CheckEqTest.cpp
+++ b/test/runtime/local/kernels/CheckEqTest.cpp
@@ -18,6 +18,7 @@
 #include <runtime/local/datastructures/DataObjectFactory.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/kernels/CheckEq.h>
 
 #include <tags.h>
@@ -28,7 +29,7 @@
 
 #include <cstdint>
 
-TEMPLATE_PRODUCT_TEST_CASE("CheckEq, original matrices", TAG_KERNELS, (DenseMatrix, CSRMatrix), (double, uint32_t)) {
+TEMPLATE_PRODUCT_TEST_CASE("CheckEq, original matrices", TAG_KERNELS, (DenseMatrix, CSRMatrix, COOMatrix), (double, uint32_t)) {
     using DT = TestType;
     
     std::vector<typename DT::VT> vals = {
@@ -118,7 +119,7 @@ TEMPLATE_PRODUCT_TEST_CASE("CheckEq, views on matrices", TAG_KERNELS, (DenseMatr
     }
 }
 
-TEMPLATE_PRODUCT_TEST_CASE("CheckEq, views on matrices", TAG_KERNELS, (CSRMatrix), (double, uint32_t)) {
+TEMPLATE_PRODUCT_TEST_CASE("CheckEq, views on matrices", TAG_KERNELS, (CSRMatrix, COOMatrix), (double, uint32_t)) {
     using DT = TestType;
     
     std::vector<typename DT::VT> vals = {
@@ -196,7 +197,7 @@ TEMPLATE_PRODUCT_TEST_CASE("CheckEq, empty matrices", TAG_KERNELS, (DenseMatrix)
     }
 }
 
-TEMPLATE_PRODUCT_TEST_CASE("CheckEq, empty matrices", TAG_KERNELS, (CSRMatrix), (double, uint32_t)) {
+TEMPLATE_PRODUCT_TEST_CASE("CheckEq, empty matrices", TAG_KERNELS, (CSRMatrix, COOMatrix), (double, uint32_t)) {
     using DT = TestType;
     
     std::vector<typename DT::VT> vals = {
diff --git a/test/runtime/local/kernels/EwBinaryMatTest.cpp b/test/runtime/local/kernels/EwBinaryMatTest.cpp
index a6b62c570..5347c0935 100644
--- a/test/runtime/local/kernels/EwBinaryMatTest.cpp
+++ b/test/runtime/local/kernels/EwBinaryMatTest.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datagen/GenGivenVals.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/kernels/CheckEq.h>
@@ -29,7 +30,7 @@
 #include <cstdint>
 
 #define TEST_NAME(opName) "EwBinaryMat (" opName ")"
-#define DATA_TYPES DenseMatrix, CSRMatrix
+#define DATA_TYPES DenseMatrix, CSRMatrix, COOMatrix
 #define VALUE_TYPES double, uint32_t
 
 template<class DT>
@@ -126,10 +127,10 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("mul"), TAG_KERNELS, (DATA_TYPES), (VALUE_T
     DataObjectFactory::destroy(m3);
 }
 
-TEMPLATE_TEST_CASE(TEST_NAME("mul_sparse_dense"), TAG_KERNELS, VALUE_TYPES) {
+TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("mul_sparse_dense"), TAG_KERNELS, (CSRMatrix, COOMatrix), (VALUE_TYPES)) {
     // TODO: all Dense - CSR combinations
-    using VT = TestType;
-    using SparseDT = CSRMatrix<VT>;
+    using SparseDT = TestType;
+    using VT = typename SparseDT::VT;
     using DT = DenseMatrix<VT>;
 
     auto m0 = genGivenVals<SparseDT>(4, {
diff --git a/test/runtime/local/kernels/EwUnaryMatTest.cpp b/test/runtime/local/kernels/EwUnaryMatTest.cpp
index 38730a79f..14aed5855 100644
--- a/test/runtime/local/kernels/EwUnaryMatTest.cpp
+++ b/test/runtime/local/kernels/EwUnaryMatTest.cpp
@@ -17,6 +17,7 @@
 #include <runtime/local/kernels/CheckEq.h>
 #include <runtime/local/kernels/CheckEqApprox.h>
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/kernels/EwUnaryMat.h>
 #include <runtime/local/datagen/GenGivenVals.h>
@@ -30,7 +31,7 @@
 #include <cstdint>
 
 #define TEST_NAME(opName) "EwUnaryMat (" opName ")"
-#define DATA_TYPES DenseMatrix
+#define DATA_TYPES DenseMatrix, COOMatrix
 #define VALUE_TYPES int32_t, double
 
 template<typename DTRes, typename DTArg>
@@ -49,10 +50,10 @@ void checkEwUnaryMatApprox(UnaryOpCode opCode, const DTArg * arg, const DTRes *
     DataObjectFactory::destroy(res);
 }
 
-template<typename DTArg>
-void checkEwUnaryMatThrow(UnaryOpCode opCode, const DTArg * arg) {
-    DTArg * res = nullptr;
-    REQUIRE_THROWS_AS((ewUnaryMat<DTArg, DTArg>(opCode, res, arg, nullptr)), std::domain_error);
+template<typename DTRes, typename DTArg>
+void checkEwUnaryMatThrow(UnaryOpCode opCode, const DTArg * arg, [[maybe_unused]] const DTRes * exp) {
+    DTRes * res = nullptr;
+    REQUIRE_THROWS_AS((ewUnaryMat<DTRes, DTArg>(opCode, res, arg, nullptr)), std::domain_error);
     DataObjectFactory::destroy(res);
 }
 
@@ -142,6 +143,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sqrt"), TAG_KERNELS, (DATA_TYPES), (VALUE_
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sqrt, check domain_error"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
     using DT = TestType;
+    using VT = typename DT::VT;
 
     auto arg = genGivenVals<DT>(3, {
         0,
@@ -149,7 +151,9 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sqrt, check domain_error"), TAG_KERNELS, (
         -1,
     });
 
-    checkEwUnaryMatThrow(UnaryOpCode::SQRT, arg);
+    DenseMatrix<VT> * dense = nullptr;
+
+    checkEwUnaryMatThrow(UnaryOpCode::SQRT, arg, dense);
 
     DataObjectFactory::destroy(arg);
 }
@@ -198,6 +202,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ln"), TAG_KERNELS, (DATA_TYPES), (VALUE_TY
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ln, check domain_error"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
     using DT = TestType;
+    using VT = typename DT::VT;
 
     auto arg = genGivenVals<DT>(3, {
         0,
@@ -205,7 +210,9 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ln, check domain_error"), TAG_KERNELS, (DA
         -1,
     });
 
-    checkEwUnaryMatThrow(UnaryOpCode::LN, arg);
+    DenseMatrix<VT> * dense = nullptr;
+
+    checkEwUnaryMatThrow(UnaryOpCode::LN, arg, dense);
 
     DataObjectFactory::destroy(arg);
 }
@@ -300,6 +307,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("asin"), TAG_KERNELS, (DATA_TYPES), (VALUE_
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("asin, check domain_error"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
     using DT = TestType;
+    using VT = typename DT::VT;
 
     auto arg = genGivenVals<DT>(3, {
         0,
@@ -307,7 +315,9 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("asin, check domain_error"), TAG_KERNELS, (
         -2,
     });
 
-    checkEwUnaryMatThrow(UnaryOpCode::ASIN, arg);
+    DenseMatrix<VT> * dense = nullptr;
+
+    checkEwUnaryMatThrow(UnaryOpCode::ASIN, arg, dense);
 
     DataObjectFactory::destroy(arg);
 }
@@ -335,6 +345,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("acos"), TAG_KERNELS, (DATA_TYPES), (VALUE_
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("acos, check domain_error"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
     using DT = TestType;
+    using VT = typename DT::VT;
 
     auto arg = genGivenVals<DT>(3, {
         0,
@@ -342,7 +353,9 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("acos, check domain_error"), TAG_KERNELS, (
         -2,
     });
 
-    checkEwUnaryMatThrow(UnaryOpCode::ACOS, arg);
+    DenseMatrix<VT> * dense = nullptr;
+
+    checkEwUnaryMatThrow(UnaryOpCode::ACOS, arg, dense);
 
     DataObjectFactory::destroy(arg);
 }
@@ -560,11 +573,13 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("round, floating-point-specific"), TAG_KERN
 // ****************************************************************************
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("some invalid op-code"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
-    
-    auto arg = genGivenVals<DT>(1, {1});
-    DT * exp = nullptr;
-    CHECK_THROWS(ewUnaryMat<DT, DT>(static_cast<UnaryOpCode>(999), exp, arg, nullptr));
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = DenseMatrix<VT>;
+
+    auto arg = genGivenVals<DTArg>(1, {1});
+    DTRes * exp = nullptr;
+    CHECK_THROWS(ewUnaryMat<DTRes, DTArg>(static_cast<UnaryOpCode>(999), exp, arg, nullptr));
 
     DataObjectFactory::destroy(arg);
 }
\ No newline at end of file
diff --git a/test/runtime/local/kernels/RandMatrixTest.cpp b/test/runtime/local/kernels/RandMatrixTest.cpp
index 3d6a1c30b..e9fd83d20 100644
--- a/test/runtime/local/kernels/RandMatrixTest.cpp
+++ b/test/runtime/local/kernels/RandMatrixTest.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DataObjectFactory.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/kernels/RandMatrix.h>
@@ -28,7 +29,7 @@
 #include <cmath>
 #include <cstdint>
 
-TEMPLATE_PRODUCT_TEST_CASE("RandMatrix", TAG_KERNELS, (DenseMatrix, CSRMatrix), (double, float, uint32_t, uint8_t)) {
+TEMPLATE_PRODUCT_TEST_CASE("RandMatrix", TAG_KERNELS, (DenseMatrix, CSRMatrix, COOMatrix), (double, float, uint32_t, uint8_t)) {
     using DT = TestType;
     using VT = typename DT::VT;
     const size_t numRows = 100;
diff --git a/test/runtime/local/kernels/TransposeTest.cpp b/test/runtime/local/kernels/TransposeTest.cpp
index 70a337504..af9a70d6e 100644
--- a/test/runtime/local/kernels/TransposeTest.cpp
+++ b/test/runtime/local/kernels/TransposeTest.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <runtime/local/datastructures/CSRMatrix.h>
+#include <runtime/local/datastructures/COOMatrix.h>
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/datagen/GenGivenVals.h>
 #include <runtime/local/kernels/CheckEq.h>
@@ -33,7 +34,7 @@ void checkTranspose(const DT * arg, const DT * exp) {
     CHECK(*res == *exp);
 }
 
-TEMPLATE_PRODUCT_TEST_CASE("Transpose", TAG_KERNELS, (DenseMatrix, CSRMatrix), (double, uint32_t)) {
+TEMPLATE_PRODUCT_TEST_CASE("Transpose", TAG_KERNELS, (DenseMatrix, CSRMatrix, COOMatrix), (double, uint32_t)) {
     using DT = TestType;
     
     DT * m = nullptr;

From 16d0b2de70dfedb02dcea81e5c8362a5c48f37c9 Mon Sep 17 00:00:00 2001
From: AlexRTer <74372589+AlexRTer@users.noreply.github.com>
Date: Thu, 22 Aug 2024 19:45:44 +0200
Subject: [PATCH 2/2] Minor Changes and Fixes - replace asserts with exceptions
 - add comments and briefs for clarity - fix colWidth formatting in print
 method and replace array allocation with vector - change print to use
 printValue utility function to handle all value types - move equality
 operator to COOMatrix.h like other datastructures - refactor AggAll kernel
 for CSR/COO to reduce code duplication. Add namespace for better usage in
 AggRow kernel - remove redundant getValues call in AggRow kernel -
 EwBinaryMat kernel:   - fix a bug that would append a zero entry to sparse
 matrices if lhs and rhs entries added to zero   - replace memcpy with fill_n
 for copy of identical row values - minor optimization in RandMatrix kernel to
 avoid potentially re-allocating resources in loop iteration - minor
 optimization and bug fix in Transpose kernel to avoid copying column indices
 for argsort and change to stable_sort to guarantee row index order stays
 valid - extend COOMatrixTest to verify proper ordering of coordinates - fix
 test errors from previous merge - minor formatting changes

---
 src/runtime/local/datagen/GenGivenVals.h      |   4 +-
 src/runtime/local/datastructures/COOMatrix.h  | 212 +++++++----
 src/runtime/local/kernels/AggAll.h            | 211 ++++-------
 src/runtime/local/kernels/AggRow.h            |  10 +-
 src/runtime/local/kernels/CheckEq.h           |  39 +-
 src/runtime/local/kernels/CheckEqApprox.h     |  10 +-
 src/runtime/local/kernels/EwBinaryMat.h       |  47 ++-
 src/runtime/local/kernels/RandMatrix.h        |  42 ++-
 src/runtime/local/kernels/Transpose.h         |  79 ++--
 .../local/datastructures/COOMatrixTest.cpp    |  49 ++-
 test/runtime/local/kernels/CheckEqTest.cpp    |   1 -
 .../runtime/local/kernels/EwBinaryMatTest.cpp |  58 ++-
 test/runtime/local/kernels/EwUnaryMatTest.cpp | 337 +++++++++++++-----
 13 files changed, 648 insertions(+), 451 deletions(-)

diff --git a/src/runtime/local/datagen/GenGivenVals.h b/src/runtime/local/datagen/GenGivenVals.h
index 298ad7a2f..d855821f8 100644
--- a/src/runtime/local/datagen/GenGivenVals.h
+++ b/src/runtime/local/datagen/GenGivenVals.h
@@ -23,6 +23,7 @@
 #include <runtime/local/datastructures/COOMatrix.h>
 
 #include <algorithm>
+#include <stdexcept>
 #include <vector>
 
 #include <cstddef>
@@ -168,7 +169,8 @@ template<typename VT>
 struct GenGivenVals<COOMatrix<VT>> {
     static COOMatrix<VT> * generate(size_t numRows, const std::vector<VT> & elements, size_t minNumNonZeros = 0) {
         const size_t numCells = elements.size();
-        assert((numCells % numRows == 0) && "number of given data elements must be divisible by given number of rows");
+        if (numCells % numRows != 0)
+            throw std::runtime_error("genGivenVals: number of given data elements must be divisible by given number of rows");
         const size_t numCols = numCells / numRows;
         size_t numNonZeros = 0;
         for(VT v : elements)
diff --git a/src/runtime/local/datastructures/COOMatrix.h b/src/runtime/local/datastructures/COOMatrix.h
index 79d1d9775..5aaf7cdab 100644
--- a/src/runtime/local/datastructures/COOMatrix.h
+++ b/src/runtime/local/datastructures/COOMatrix.h
@@ -21,18 +21,33 @@
 #include <runtime/local/datastructures/ValueTypeUtils.h>
 
 #include <algorithm>
+#include <iomanip>
 #include <iostream>
 #include <memory>
+#include <stdexcept>
 #include <utility>
-#include <iomanip>
+#include <vector>
 
-#include <cassert>
 #include <cstddef>
 #include <cstring>
 
 /**
  * @brief A sparse matrix in COOrdinate (COO) format.
+ * 
+ * This matrix implementation is backed by three contiguous arrays.
+ * A `values` array containing all non-zero entries, as well as two arrays containing
+ * the corresponding row and column indices of every non-zero entry - `rowIdxs` and `colIdxs`.
+ * All of them have a total capacity of `maxNumNonZeros` entries.
+ * Row indices are sorted in ascending order. The same applies to the column indices within each row.
  *
+ * To mark the current end of these arrays they reserve one more entry to store `VT(0)` (`values`)
+ * or `size_t(-1)` (index arrays). This sentinel indicates how many entries the matrix currently
+ * holds, as opposed to its total capacity, and needs to be set again if overwritten.
+ *
+ * Each instance of this class might represent a (row-based) sub-matrix of another
+ * `COOMatrix`. Thus, passing the `rowIdx` to `getValues()` or the `get*Idxs()`
+ * methods returns a pointer to the start of the respective row. Its size can
+ * be retrieved by calling `getNumNonZerosRow()` with the same `rowIdx`.
  */
 template<typename ValueType>
 class COOMatrix : public Matrix<ValueType> {
@@ -48,7 +63,8 @@ class COOMatrix : public Matrix<ValueType> {
     size_t maxNumNonZeros;
 
     /**
-     * @brief We need these in order to accommodate row-based sub-matrix views.
+     * @brief Lower and upper bounds to accommodate views on a row-based
+     * sub-matrix.
      */
     size_t lowerRow;
     size_t upperRow;
@@ -113,10 +129,15 @@ class COOMatrix : public Matrix<ValueType> {
             lowerRow(rowLowerIncl),
             upperRow(rowUpperExcl),
             view(true) {
-        assert(src && "src must not be null");
-        assert((rowLowerIncl < src->numRows) && "rowLowerIncl is out of bounds");
-        assert((rowUpperExcl <= src->numRows) && "rowUpperExcl is out of bounds");
-        assert((rowLowerIncl < rowUpperExcl) && "rowLowerIncl must be lower than rowUpperExcl");
+        
+        if (!src)
+            throw std::runtime_error("COOMatrix: src must not be null");
+        if (rowLowerIncl >= src->numRows)
+            throw std::runtime_error("COOMatrix: rowLowerIncl is out of bounds");
+        if (rowUpperExcl > src->numRows)
+            throw std::runtime_error("COOMatrix: rowUpperExcl is out of bounds");
+        if (rowLowerIncl >= rowUpperExcl)
+            throw std::runtime_error("COOMatrix: rowLowerIncl must be lower than rowUpperExcl");
 
         rowIdxs = src->rowIdxs;
         colIdxs = src->colIdxs;
@@ -127,7 +148,18 @@ class COOMatrix : public Matrix<ValueType> {
         // nothing to do
     }
 
+    /**
+     * @brief Returns a pair containing the index of the first coordinate pointing to
+     * a value in the specified row as well as the number of values in that row.
+     *
+     * @param rowIdx Index of the specified row.
+     * @param start Inclusive lower bound of where to begin querying for the specified
+     * row in the coordinate array.
+     */
     [[nodiscard]] std::pair<size_t, size_t> rowRange(size_t rowIdx, size_t start) const {
+        if (rowIdx - lowerRow >= upperRow)
+            throw std::runtime_error("COOMatrix: rowIdx is out of bounds");
+
         size_t rowStart = 0, rowLength = 0, row, i = start;
         while (true) {
             row = rowIdxs.get()[i];
@@ -151,8 +183,20 @@ class COOMatrix : public Matrix<ValueType> {
         return std::make_pair(rowStart, rowLength);
     }
 
+    /**
+     * @brief Inserts a non-zero value and its indices at the given position
+     * in the values, rowIdxs, colIdxs arrays. Does not check whether a
+     * value with the same coordinates already exists and skips zero
+     * values without insertion.
+     *
+     * @param pos The index at which to insert the values into the respective arrays.
+     * @param rowIdx The row index of the inserted value.
+     * @param colIdx The col index of the inserted value.
+     * @param value The value to be inserted.
+     */
     void insert(size_t pos, size_t rowIdx, size_t colIdx, ValueType value) {
-        assert((this->getNumNonZeros() < maxNumNonZeros) && "can't add any more nonzero values");
+        if (this->getNumNonZeros() >= maxNumNonZeros)
+            throw std::runtime_error("COOMatrix: cannot add any more nonzero values");
 
         if (value == ValueType(0)) return;
         ValueType val = values.get()[pos];
@@ -177,14 +221,18 @@ class COOMatrix : public Matrix<ValueType> {
         colIdxs.get()[pos] = colIdx;
     }
 
+    /**
+     * @brief Removes the entry at the given index.
+     *
+     * @param idx The index at which to remove the entry.
+     */
     void remove(size_t idx) {
-        size_t i = idx;
         while (true) {
-            rowIdxs.get()[i] = rowIdxs.get()[i + 1];
-            values.get()[i] = values.get()[i + 1];
-            colIdxs.get()[i] = colIdxs.get()[i + 1];
-            if (rowIdxs.get()[i] == size_t(-1)) break;
-            i++;
+            rowIdxs.get()[idx] = rowIdxs.get()[idx + 1];
+            values.get()[idx] = values.get()[idx + 1];
+            colIdxs.get()[idx] = colIdxs.get()[idx + 1];
+            if (rowIdxs.get()[idx] == size_t(-1)) break;
+            ++idx;
         }
     }
 
@@ -222,14 +270,15 @@ class COOMatrix : public Matrix<ValueType> {
     }
 
     [[nodiscard]] size_t getNumNonZerosRow(size_t rowIdx) const {
-        assert((rowIdx < numRows) && "rowIdx is out of bounds");
+        if (rowIdx >= numRows)
+            throw std::runtime_error("COOMatrix: rowIdx is out of bounds");
 
-        std::pair<size_t, size_t> range = rowRange(rowIdx + lowerRow, 0);
-        return range.second;
+        return rowRange(rowIdx + lowerRow, 0).second;
     }
 
     [[nodiscard]] size_t getNumNonZerosCol(size_t colIdx) const {
-        assert((colIdx < numCols) && "colIdx is out of bounds");
+        if (colIdx >= numCols)
+            throw std::runtime_error("COOMatrix: colIdx is out of bounds");
 
         size_t cnt = 0, i = 0;
         while (true) {
@@ -243,8 +292,7 @@ class COOMatrix : public Matrix<ValueType> {
     }
 
     [[nodiscard]] ValueType *getValues() {
-        std::pair<size_t, size_t> range = rowRange(lowerRow, 0);
-        size_t rowStart = range.first;
+        size_t rowStart = rowRange(lowerRow, 0).first;
 
         return values.get() + rowStart;
     }
@@ -254,8 +302,7 @@ class COOMatrix : public Matrix<ValueType> {
     }
 
     [[nodiscard]] size_t *getColIdxs() {
-        std::pair<size_t, size_t> range = rowRange(lowerRow, 0);
-        size_t rowStart = range.first;
+        size_t rowStart = rowRange(lowerRow, 0).first;
 
         return colIdxs.get() + rowStart;
     }
@@ -265,8 +312,7 @@ class COOMatrix : public Matrix<ValueType> {
     }
 
     [[nodiscard]] size_t *getRowIdxs() {
-        std::pair<size_t, size_t> range = rowRange(lowerRow, 0);
-        size_t rowStart = range.first;
+        size_t rowStart = rowRange(lowerRow, 0).first;
 
         return rowIdxs.get() + rowStart;
     }
@@ -276,10 +322,10 @@ class COOMatrix : public Matrix<ValueType> {
     }
 
     [[nodiscard]] ValueType *getValues(size_t rowIdx) {
-        assert((rowIdx <= numRows) && "rowIdx is out of bounds");
+        if (rowIdx > numRows)
+            throw std::runtime_error("COOMatrix: rowIdx is out of bounds");
 
-        std::pair<size_t, size_t> range = rowRange(rowIdx + lowerRow, 0);
-        size_t rowStart = range.first;
+        size_t rowStart = rowRange(rowIdx + lowerRow, 0).first;
 
         return values.get() + rowStart;
     }
@@ -289,10 +335,10 @@ class COOMatrix : public Matrix<ValueType> {
     }
 
     [[nodiscard]] size_t *getColIdxs(size_t rowIdx) {
-        assert((rowIdx <= numRows) && "rowIdx is out of bounds");
+        if (rowIdx > numRows)
+            throw std::runtime_error("COOMatrix: rowIdx is out of bounds");
 
-        std::pair<size_t, size_t> range = rowRange(rowIdx + lowerRow, 0);
-        size_t rowStart = range.first;
+        size_t rowStart = rowRange(rowIdx + lowerRow, 0).first;
 
         return colIdxs.get() + rowStart;
     }
@@ -302,10 +348,10 @@ class COOMatrix : public Matrix<ValueType> {
     }
 
     [[nodiscard]] size_t *getRowIdxs(size_t rowIdx) {
-        assert((rowIdx <= numRows) && "rowIdx is out of bounds");
+        if (rowIdx > numRows)
+            throw std::runtime_error("COOMatrix: rowIdx is out of bounds");
 
-        std::pair<size_t, size_t> range = rowRange(rowIdx + lowerRow, 0);
-        size_t rowStart = range.first;
+        size_t rowStart = rowRange(rowIdx + lowerRow, 0).first;
 
         return rowIdxs.get() + rowStart;
     }
@@ -317,8 +363,10 @@ class COOMatrix : public Matrix<ValueType> {
     [[nodiscard]] ValueType get(size_t rowIdx, size_t colIdx) const override {
         rowIdx += lowerRow;
 
-        assert((rowIdx < numRows) && "rowIdx is out of bounds");
-        assert((colIdx < numCols) && "colIdx is out of bounds");
+        if (rowIdx >= upperRow)
+            throw std::runtime_error("COOMatrix: rowIdx is out of bounds");
+        if (colIdx >= numCols)
+            throw std::runtime_error("COOMatrix: colIdx is out of bounds");
 
         for (size_t i = 0; i < maxNumNonZeros; i++) {
             size_t row = rowIdxs.get()[i];
@@ -336,9 +384,12 @@ class COOMatrix : public Matrix<ValueType> {
     void set(size_t rowIdx, size_t colIdx, ValueType value) override {
         rowIdx += lowerRow;
 
-        assert((rowIdx < numRows) && "rowIdx is out of bounds");
-        assert((colIdx < numCols) && "colIdx is out of bounds");
+        if (rowIdx >= numRows)
+            throw std::runtime_error("COOMatrix: rowIdx is out of bounds");
+        if (colIdx >= numCols)
+            throw std::runtime_error("COOMatrix: colIdx is out of bounds");
 
+        // Zero values are handled by `insert`.
         for (size_t i = 0; i < maxNumNonZeros; i++) {
             size_t row = rowIdxs.get()[i];
             if (row == size_t(-1)) {
@@ -373,9 +424,12 @@ class COOMatrix : public Matrix<ValueType> {
     }
 
     void append(size_t rowIdx, size_t colIdx, ValueType value) override {
-        assert((rowIdx < numRows) && "rowIdx is out of bounds");
-        assert((colIdx < numCols) && "colIdx is out of bounds");
-        assert((appendHelp < maxNumNonZeros) && "can't add any more nonzero values");
+        if (rowIdx >= numRows)
+            throw std::runtime_error("COOMatrix: rowIdx is out of bounds");
+        if (colIdx >= numCols)
+            throw std::runtime_error("COOMatrix: colIdx is out of bounds");
+        if (appendHelp >= maxNumNonZeros)
+            throw std::runtime_error("COOMatrix: can't add any more nonzero values");
 
         if (value == ValueType(0)) return;
 
@@ -397,45 +451,43 @@ class COOMatrix : public Matrix<ValueType> {
      */
     void print(std::ostream &os) const override {
         os << "COOMatrix(" << numRows << 'x' << numCols << ", "
-           << ValueTypeUtils::cppNameFor<ValueType> << ')' << std::endl << std::endl;
+           << ValueTypeUtils::cppNameFor<ValueType> << ')' << std::endl;
 
-        auto *colWidths = new int[numCols];
-        for (size_t i = 0; i < numCols; ++i) {
-            colWidths[i] = 1;
-        }
+        ValueTypeCode VTCode = ValueTypeUtils::codeFor<ValueType>;
+
+        std::vector<int64_t> colWidths(numCols, 1);
 
         size_t index = 0;
         while (true) {
-            ValueType value = values.get()[index];
-            if (value == ValueType(0)) break;
+            if (rowIdxs.get()[index] == size_t(-1)) break;
             std::ostringstream oss;
-            oss << value;
+            ValueTypeUtils::printValue(oss, VTCode, values.get(), index);
             std::string strValue = oss.str();
-            colWidths[colIdxs.get()[index]] = std::max(static_cast<int>(strValue.length()),
+            colWidths[colIdxs.get()[index]] = std::max(static_cast<int64_t>(strValue.length()),
                                                        colWidths[colIdxs.get()[index]]);
             index++;
         }
 
         size_t i = 0;
-        ValueType val = values.get()[i];
         size_t row = rowIdxs.get()[i];
         size_t col = colIdxs.get()[i];
         for (size_t currRow = 0; currRow < numRows; ++currRow) {
             for (size_t currCol = 0; currCol < numCols; ++currCol) {
-                if (currRow == row && currCol == col) {
-                    os << std::setw(colWidths[col]) << val << " ";
+                if (currRow == row && currCol == col && i < index) {
+                    os << std::setw(colWidths[currCol]);
+                    ValueTypeUtils::printValue(os, VTCode, values.get(), i);
+                    os << " ";
                     i++;
-                    val = values.get()[i];
-                    row = rowIdxs.get()[i];
-                    col = colIdxs.get()[i];
+                    if (i < index) {
+                        row = rowIdxs.get()[i];
+                        col = colIdxs.get()[i];
+                    }
                 } else {
-                    os << std::setw(colWidths[col]) << ValueType(0) << " ";
+                    os << std::setw(colWidths[currCol]) << 0 << " ";
                 }
             }
             os << std::endl;
         }
-
-        delete[] colWidths;
     }
 
     /**
@@ -445,18 +497,18 @@ class COOMatrix : public Matrix<ValueType> {
     void printRaw(std::ostream &os) const {
         size_t numNonZeros = this->getNumNonZeros();
         os << "COOMatrix(" << numRows << 'x' << numCols << ", "
-           << ValueTypeUtils::cppNameFor<ValueType> << ')' << std::endl;
-        os << "maxNumNonZeros: \t" << maxNumNonZeros << std::endl;
-        os << "numNonZeros: \t" << this->getNumNonZeros() << std::endl;
+           << ValueTypeUtils::cppNameFor<ValueType> << ')' << "\n";
+        os << "maxNumNonZeros:\t" << maxNumNonZeros << "\n";
+        os << "numNonZeros: \t" << this->getNumNonZeros() << "\n";
         os << "values: \t";
         size_t offset = rowRange(lowerRow, 0).first;
         for (size_t i = 0; i < numNonZeros; i++)
             os << values.get()[i + offset] << ", ";
-        os << std::endl;
+        os << "\n";
         os << "colIdxs: \t";
         for (size_t i = 0; i < numNonZeros; i++)
             os << colIdxs.get()[i + offset] << ", ";
-        os << std::endl;
+        os << "\n";
         os << "rowIdxs: \t";
         for (size_t i = 0; i < numNonZeros; i++)
             os << rowIdxs.get()[i + offset] << ", ";
@@ -479,6 +531,40 @@ class COOMatrix : public Matrix<ValueType> {
         return (maxNumNonZeros + 1) * (sizeof(ValueType) + sizeof(size_t) + sizeof(size_t));
     }
 
+    bool operator==(const COOMatrix<ValueType> & rhs) const {
+        if (this == &rhs)
+            return true;
+
+        const size_t numRows = this->getNumRows();
+        const size_t numCols = this->getNumCols();
+        if (numRows != rhs.getNumRows() || numCols != rhs.getNumCols())
+            return false;
+
+        const size_t nnzLhs = this->getNumNonZeros();
+        if (nnzLhs != rhs.getNumNonZeros())
+            return false;
+
+        const ValueType * valuesLhs = this->getValues();
+        const size_t * rowsLhs = this->getRowIdxs();
+        const size_t * colsLhs = this->getColIdxs();
+
+        const ValueType * valuesRhs = rhs.getValues();
+        const size_t * rowsRhs = rhs.getRowIdxs();
+        const size_t * colsRhs = rhs.getColIdxs();
+
+        size_t lowerRowLhs = this->getLowerRow();
+        size_t lowerRowRhs = rhs.getLowerRow();
+
+        for (size_t i = 0; i < nnzLhs; i++) {
+            if (   rowsLhs[i] - lowerRowLhs != rowsRhs[i] - lowerRowRhs
+                || colsLhs[i] != colsRhs[i]
+                || valuesLhs[i] != valuesRhs[i]
+                )
+                return false;
+        }
+        return true;
+    }
+
     size_t serialize(std::vector<char> &buf) const override;
 };
 
diff --git a/src/runtime/local/kernels/AggAll.h b/src/runtime/local/kernels/AggAll.h
index bf2ece086..d3e83a557 100644
--- a/src/runtime/local/kernels/AggAll.h
+++ b/src/runtime/local/kernels/AggAll.h
@@ -46,6 +46,81 @@ VTRes aggAll(AggOpCode opCode, const DTArg * arg, DCTX(ctx)) {
     return AggAll<VTRes, DTArg>::apply(opCode, arg, ctx);
 }
 
+// ****************************************************************************
+// Functions called by multiple template specializations
+// ****************************************************************************
+
+namespace AggAllUtility {
+    template<typename VTRes, typename VTArg>
+    VTRes aggArray(const VTArg * values, size_t numNonZeros, size_t numCells, EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func, bool isSparseSafe, VTRes neutral, DCTX(ctx)) {
+        if(numNonZeros) {
+            VTRes agg = static_cast<VTRes>(values[0]);
+            for(size_t i = 1; i < numNonZeros; i++)
+                agg = func(agg, static_cast<VTRes>(values[i]), ctx);
+
+            if(!isSparseSafe && numNonZeros < numCells) // fold in one implicit zero for the cells not stored
+                agg = func(agg, 0, ctx);
+
+            return agg;
+        }
+        else
+            return func(neutral, 0, ctx);
+    }
+
+    template<typename VTRes, typename DTArg, typename VTArg>
+    VTRes aggAllApplySparse(AggOpCode opCode, const DTArg * arg, DCTX(ctx)) {
+        if (AggOpCodeUtils::isPureBinaryReduction(opCode)) {
+            EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func = getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(opCode));
+
+            return aggArray(
+                    arg->getValues(0),
+                    arg->getNumNonZeros(),
+                    arg->getNumRows() * arg->getNumCols(),
+                    func,
+                    AggOpCodeUtils::isSparseSafe(opCode),
+                    AggOpCodeUtils::template getNeutral<VTRes>(opCode),
+                    ctx
+            );
+        }
+        else { // The op-code is either MEAN or STDDEV or VAR.
+            EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func = getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(AggOpCode::SUM));
+            auto agg = aggArray(
+                    arg->getValues(0),
+                    arg->getNumNonZeros(),
+                    arg->getNumRows() * arg->getNumCols(),
+                    func,
+                    true,
+                    VTRes(0),
+                    ctx
+            );
+            agg = agg / (arg->getNumRows() * arg->getNumCols());
+            if (opCode == AggOpCode::MEAN)
+                return agg;
+            else {
+                // STDDEV or VAR: accumulate the squared deviations from the mean (agg).
+                VTRes stddev=0;
+
+                const VTArg * valuesArg = arg->getValues(0);
+                for (size_t i = 0; i < arg->getNumNonZeros(); i++) {
+                    VTRes val = static_cast<VTRes>((valuesArg[i])) - agg;
+                    stddev = stddev + val * val;
+                }
+                stddev += ((arg->getNumRows() * arg->getNumCols()) - arg->getNumNonZeros())*agg*agg;
+                stddev /= (arg->getNumRows() * arg->getNumCols());
+
+                // At this point, stddev holds the variance (sqrt() has not been applied yet).
+                if (opCode == AggOpCode::VAR){
+                    VTRes var = stddev;
+                    return var;
+                }
+
+                stddev = sqrt(stddev);
+                return stddev;
+            }
+        }
+    }
+}
+
 // ****************************************************************************
 // (Partial) template specializations for different data/value types
 // ****************************************************************************
@@ -120,74 +195,9 @@ struct AggAll<VTRes, DenseMatrix<VTArg>> {
 // ----------------------------------------------------------------------------
 
 template<typename VTRes, typename VTArg>
-struct AggAll<VTRes, CSRMatrix<VTArg>> {
-    static VTRes aggArray(const VTArg * values, size_t numNonZeros, size_t numCells, EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func, bool isSparseSafe, VTRes neutral, DCTX(ctx)) {
-        if(numNonZeros) {
-            VTRes agg = static_cast<VTRes>(values[0]);
-            for(size_t i = 1; i < numNonZeros; i++)
-                agg = func(agg, static_cast<VTRes>(values[i]), ctx);
-
-            if(!isSparseSafe && numNonZeros < numCells)
-                agg = func(agg, 0, ctx);
-
-            return agg;
-        }
-        else
-            return func(neutral, 0, ctx);
-    }
-    
+struct AggAll<VTRes, CSRMatrix<VTArg>> {    
     static VTRes apply(AggOpCode opCode, const CSRMatrix<VTArg> * arg, DCTX(ctx)) {
-        if(AggOpCodeUtils::isPureBinaryReduction(opCode)) {
-
-            EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func = getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(opCode));
-            
-            return aggArray(
-                    arg->getValues(0),
-                    arg->getNumNonZeros(),
-                    arg->getNumRows() * arg->getNumCols(),
-                    func,
-                    AggOpCodeUtils::isSparseSafe(opCode),
-                    AggOpCodeUtils::template getNeutral<VTRes>(opCode),
-                    ctx
-            );
-        }
-        else { // The op-code is either MEAN or STDDEV or VAR.
-            EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func = getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(AggOpCode::SUM));            
-            auto agg = aggArray(
-                arg->getValues(0),
-                arg->getNumNonZeros(),
-                arg->getNumRows() * arg->getNumCols(),
-                func,
-                true,
-                VTRes(0),
-                ctx
-            );
-            agg = agg / (arg->getNumRows() * arg->getNumCols());
-            if (opCode == AggOpCode::MEAN)
-                return agg;
-            else{
-                //STDDEV-VAR
-                VTRes stddev=0;
-
-                const VTArg * valuesArg = arg->getValues(0);
-                for(size_t i = 0; i < arg->getNumNonZeros(); i++) {
-                    VTRes val = static_cast<VTRes>((valuesArg[i])) - agg;
-                    stddev = stddev + val * val;
-                }
-                stddev += ((arg->getNumRows() * arg->getNumCols()) - arg->getNumNonZeros())*agg*agg;
-                stddev /= (arg->getNumRows() * arg->getNumCols());
-                 
-                //Variance --> stddev before sqrt() is variance
-                if (opCode == AggOpCode::VAR){
-                    VTRes var = stddev;
-                    return var;
-                }
-
-                stddev = sqrt(stddev);
-                return stddev;
-
-            }
-        }
+        return AggAllUtility::aggAllApplySparse<VTRes, CSRMatrix<VTArg>, VTArg>(opCode, arg, ctx);
     }
 };
 
@@ -197,73 +207,8 @@ struct AggAll<VTRes, CSRMatrix<VTArg>> {
 
 template<typename VTRes, typename VTArg>
 struct AggAll<VTRes, COOMatrix<VTArg>> {
-    static VTRes aggArray(const VTArg * values, size_t numNonZeros, size_t numCells, EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func, bool isSparseSafe, VTRes neutral, DCTX(ctx)) {
-        if(numNonZeros) {
-            VTRes agg = static_cast<VTRes>(values[0]);
-            for(size_t i = 1; i < numNonZeros; i++)
-                agg = func(agg, static_cast<VTRes>(values[i]), ctx);
-
-            if(!isSparseSafe && numNonZeros < numCells)
-                agg = func(agg, 0, ctx);
-
-            return agg;
-        }
-        else
-            return func(neutral, 0, ctx);
-    }
-
     static VTRes apply(AggOpCode opCode, const COOMatrix<VTArg> * arg, DCTX(ctx)) {
-        if(AggOpCodeUtils::isPureBinaryReduction(opCode)) {
-
-            EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func = getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(opCode));
-
-            return aggArray(
-                    arg->getValues(0),
-                    arg->getNumNonZeros(),
-                    arg->getNumRows() * arg->getNumCols(),
-                    func,
-                    AggOpCodeUtils::isSparseSafe(opCode),
-                    AggOpCodeUtils::template getNeutral<VTRes>(opCode),
-                    ctx
-            );
-        }
-        else { // The op-code is either MEAN or STDDEV or VAR.
-            EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func = getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(AggOpCode::SUM));
-            auto agg = aggArray(
-                    arg->getValues(0),
-                    arg->getNumNonZeros(),
-                    arg->getNumRows() * arg->getNumCols(),
-                    func,
-                    true,
-                    VTRes(0),
-                    ctx
-            );
-            agg = agg / (arg->getNumRows() * arg->getNumCols());
-            if (opCode == AggOpCode::MEAN)
-                return agg;
-            else{
-                //STDDEV-VAR
-                VTRes stddev=0;
-
-                const VTArg * valuesArg = arg->getValues(0);
-                for(size_t i = 0; i < arg->getNumNonZeros(); i++) {
-                    VTRes val = static_cast<VTRes>((valuesArg[i])) - agg;
-                    stddev = stddev + val * val;
-                }
-                stddev += ((arg->getNumRows() * arg->getNumCols()) - arg->getNumNonZeros())*agg*agg;
-                stddev /= (arg->getNumRows() * arg->getNumCols());
-
-                //Variance --> stddev before sqrt() is variance
-                if (opCode == AggOpCode::VAR){
-                    VTRes var = stddev;
-                    return var;
-                }
-
-                stddev = sqrt(stddev);
-                return stddev;
-
-            }
-        }
+        return AggAllUtility::aggAllApplySparse<VTRes, COOMatrix<VTArg>, VTArg>(opCode, arg, ctx);
     }
 };
 
diff --git a/src/runtime/local/kernels/AggRow.h b/src/runtime/local/kernels/AggRow.h
index f3fd16fe9..c302619ea 100644
--- a/src/runtime/local/kernels/AggRow.h
+++ b/src/runtime/local/kernels/AggRow.h
@@ -32,7 +32,6 @@
 #include <cstddef>
 #include <cstring>
 #include <cmath>
-#include <typeinfo>
 
 // ****************************************************************************
 // Struct for partial template specialization
@@ -191,7 +190,7 @@ struct AggRow<DenseMatrix<VTRes>, CSRMatrix<VTArg>> {
             const VTRes neutral = AggOpCodeUtils::template getNeutral<VTRes>(opCode);
         
             for(size_t r = 0; r < numRows; r++) {
-                *valuesRes = AggAll<VTRes, CSRMatrix<VTArg>>::aggArray(
+                *valuesRes = AggAllUtility::aggArray(
                         arg->getValues(r),
                         arg->getNumNonZeros(r),
                         numCols,
@@ -212,7 +211,7 @@ struct AggRow<DenseMatrix<VTRes>, CSRMatrix<VTArg>> {
             VTRes * valuesT = tmp->getValues();
             EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func = getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(AggOpCode::SUM));
             for (size_t r = 0; r < numRows; r++){
-                *valuesRes = AggAll<VTRes, CSRMatrix<VTArg>>::aggArray(
+                *valuesRes = AggAllUtility::aggArray(
                     arg->getValues(r),
                     arg->getNumNonZeros(r),
                     numCols,
@@ -240,7 +239,6 @@ struct AggRow<DenseMatrix<VTRes>, CSRMatrix<VTArg>> {
                 }
                 valuesRes += res->getRowSkip();
             }
-            valuesRes = res->getValues();
             DataObjectFactory::destroy<DenseMatrix<VTRes>>(tmp);
 
         }
@@ -270,7 +268,7 @@ struct AggRow<DenseMatrix<VTRes>, COOMatrix<VTArg>> {
             const VTRes neutral = AggOpCodeUtils::template getNeutral<VTRes>(opCode);
 
             for(size_t r = 0; r < numRows; r++) {
-                *valuesRes = AggAll<VTRes, COOMatrix<VTArg>>::aggArray(
+                *valuesRes = AggAllUtility::aggArray(
                         arg->getValues(r),
                         arg->getNumNonZerosRow(r),
                         numCols,
@@ -291,7 +289,7 @@ struct AggRow<DenseMatrix<VTRes>, COOMatrix<VTArg>> {
             VTRes * valuesT = tmp->getValues();
             EwBinaryScaFuncPtr<VTRes, VTRes, VTRes> func = getEwBinaryScaFuncPtr<VTRes, VTRes, VTRes>(AggOpCodeUtils::getBinaryOpCode(AggOpCode::SUM));
             for (size_t r = 0; r < numRows; r++){
-                *valuesRes = AggAll<VTRes, COOMatrix<VTArg>>::aggArray(
+                *valuesRes = AggAllUtility::aggArray(
                         arg->getValues(r),
                         arg->getNumNonZerosRow(r),
                         numCols,
diff --git a/src/runtime/local/kernels/CheckEq.h b/src/runtime/local/kernels/CheckEq.h
index 4fbc2c720..9bb427beb 100644
--- a/src/runtime/local/kernels/CheckEq.h
+++ b/src/runtime/local/kernels/CheckEq.h
@@ -86,44 +86,7 @@ struct CheckEq<CSRMatrix<VT>> {
 template<typename VT>
 struct CheckEq<COOMatrix<VT>> {
     static bool apply(const COOMatrix<VT> * lhs, const COOMatrix<VT> * rhs, DCTX(ctx)) {
-        if(lhs == rhs)
-            return true;
-
-        const size_t numRows = lhs->getNumRows();
-        const size_t numCols = lhs->getNumCols();
-
-        if(numRows != rhs->getNumRows() || numCols != rhs->getNumCols())
-            return false;
-
-        const VT * valuesLhs = lhs->getValues();
-        const size_t * rowsLhs = lhs->getRowIdxs();
-        const size_t * colsLhs = lhs->getColIdxs();
-
-        const VT * valuesRhs = rhs->getValues();
-        const size_t * rowsRhs = rhs->getRowIdxs();
-        const size_t * colsRhs = rhs->getColIdxs();
-
-        const size_t nnzLhs = lhs->getNumNonZeros();
-        const size_t nnzRhs = rhs->getNumNonZeros();
-
-        size_t lowerRowLhs = lhs->getLowerRow();
-        size_t lowerRowRhs = rhs->getLowerRow();
-
-        if(nnzLhs != nnzRhs)
-            return false;
-
-        for (size_t i = 0; i < nnzLhs; i++) {
-            const size_t rowLhs = rowsLhs[i] - lowerRowLhs;
-            const size_t rowRhs = rowsRhs[i] - lowerRowRhs;
-            if (rowLhs != rowRhs) return false;
-            const size_t colLhs = colsLhs[i];
-            const size_t colRhs = colsRhs[i];
-            if (colLhs != colRhs) return false;
-            const VT valLhs = valuesLhs[i];
-            const VT valRhs = valuesRhs[i];
-            if (valLhs != valRhs) return false;
-        }
-        return true;
+        return *lhs == *rhs;
     }
 };
 
diff --git a/src/runtime/local/kernels/CheckEqApprox.h b/src/runtime/local/kernels/CheckEqApprox.h
index e3e46a665..4d0aefb12 100644
--- a/src/runtime/local/kernels/CheckEqApprox.h
+++ b/src/runtime/local/kernels/CheckEqApprox.h
@@ -196,12 +196,10 @@ struct CheckEqApprox<COOMatrix<VT>> {
             return false;
 
         for (size_t i = 0; i < nnzLhs; i++) {
-            const size_t rowLhs = rowsLhs[i] - lowerRowLhs;
-            const size_t rowRhs = rowsRhs[i] - lowerRowRhs;
-            if (rowLhs != rowRhs) return false;
-            const size_t colLhs = colsLhs[i];
-            const size_t colRhs = colsRhs[i];
-            if (colLhs != colRhs) return false;
+            if (   rowsLhs[i] - lowerRowLhs != rowsRhs[i] - lowerRowRhs
+                || colsLhs[i] != colsRhs[i]
+                )
+                return false;
             const VT valLhs = valuesLhs[i];
             const VT valRhs = valuesRhs[i];
             VT diff = valLhs - valRhs;
diff --git a/src/runtime/local/kernels/EwBinaryMat.h b/src/runtime/local/kernels/EwBinaryMat.h
index 0c53d8f0b..79b1657c2 100644
--- a/src/runtime/local/kernels/EwBinaryMat.h
+++ b/src/runtime/local/kernels/EwBinaryMat.h
@@ -25,6 +25,8 @@
 #include <runtime/local/kernels/BinaryOpCode.h>
 #include <runtime/local/kernels/EwBinarySca.h>
 
+#include <algorithm>
+
 #include <cstddef>
 
 // ****************************************************************************
@@ -161,8 +163,12 @@ struct EwBinaryMat<CSRMatrix<VT>, CSRMatrix<VT>, CSRMatrix<VT>> {
                         size_t posRes = 0;
                         while(posLhs < nnzRowLhs && posRhs < nnzRowRhs) {
                             if(colIdxsRowLhs[posLhs] == colIdxsRowRhs[posRhs]) {
-                                valuesRowRes[posRes] = func(valuesRowLhs[posLhs], valuesRowRhs[posRhs], ctx);
-                                colIdxsRowRes[posRes] = colIdxsRowLhs[posLhs];
+                                VT funcRes = func(valuesRowLhs[posLhs], valuesRowRhs[posRhs], ctx);
+                                if (funcRes != VT(0)) {
+                                    valuesRowRes[posRes] = funcRes;
+                                    colIdxsRowRes[posRes] = colIdxsRowLhs[posLhs];
+                                    posRes++;
+                                }
                                 posLhs++;
                                 posRhs++;
                             }
@@ -170,13 +176,14 @@ struct EwBinaryMat<CSRMatrix<VT>, CSRMatrix<VT>, CSRMatrix<VT>> {
                                 valuesRowRes[posRes] = valuesRowLhs[posLhs];
                                 colIdxsRowRes[posRes] = colIdxsRowLhs[posLhs];
                                 posLhs++;
+                                posRes++;
                             }
                             else {
                                 valuesRowRes[posRes] = valuesRowRhs[posRhs];
                                 colIdxsRowRes[posRes] = colIdxsRowRhs[posRhs];
                                 posRhs++;
+                                posRes++;
                             }
-                            posRes++;
                         }
                         // copy from left
                         const size_t restRowLhs = nnzRowLhs - posLhs;
@@ -285,6 +292,10 @@ struct EwBinaryMat<COOMatrix<VT>, COOMatrix<VT>, COOMatrix<VT>> {
                 for(size_t rowIdx = 0; rowIdx < numRows; rowIdx++) {
                     size_t nnzRowLhs = lhs->getNumNonZerosRow(rowIdx);
                     size_t nnzRowRhs = rhs->getNumNonZerosRow(rowIdx);
+                    // If only one of the two input rows has non-zero entries, its values
+                    // and coordinates can simply be copied to res. Otherwise, both rows are
+                    // merged until only one of them has non-zero entries remaining, which
+                    // are then copied as well.
                     if(nnzRowLhs && nnzRowRhs) {
                         // merge within row
                         const VT * valuesRowLhs = lhs->getValues(rowIdx);
@@ -292,8 +303,6 @@ struct EwBinaryMat<COOMatrix<VT>, COOMatrix<VT>, COOMatrix<VT>> {
                         VT * valuesRowRes = res->getValues(rowIdx);
                         const size_t * colIdxsRowLhs = lhs->getColIdxs(rowIdx);
                         const size_t * colIdxsRowRhs = rhs->getColIdxs(rowIdx);
-                        const size_t *rowIdxsRowArg1 = lhs->getRowIdxs(rowIdx);
-                        const size_t *rowIdxsRowArg2 = rhs->getRowIdxs(rowIdx);
                         size_t * colIdxsRowRes = res->getColIdxs(rowIdx);
                         size_t * rowIdxsRowRes = res->getRowIdxs(rowIdx);
                         size_t posLhs = 0;
@@ -301,34 +310,41 @@ struct EwBinaryMat<COOMatrix<VT>, COOMatrix<VT>, COOMatrix<VT>> {
                         size_t posRes = 0;
                         while(posLhs < nnzRowLhs && posRhs < nnzRowRhs) {
                             if(colIdxsRowLhs[posLhs] == colIdxsRowRhs[posRhs]) {
-                                valuesRowRes[posRes] = func(valuesRowLhs[posLhs], valuesRowRhs[posRhs], ctx);
-                                colIdxsRowRes[posRes] = colIdxsRowLhs[posLhs];
+                                VT funcRes = func(valuesRowLhs[posLhs], valuesRowRhs[posRhs], ctx);
+                                if (funcRes != VT(0)) {
+                                    valuesRowRes[posRes] = funcRes;
+                                    colIdxsRowRes[posRes] = colIdxsRowLhs[posLhs];
+                                    rowIdxsRowRes[posRes] = rowIdx;
+                                    posRes++;
+                                }
                                 posLhs++;
                                 posRhs++;
                             }
                             else if(colIdxsRowLhs[posLhs] < colIdxsRowRhs[posRhs]) {
                                 valuesRowRes[posRes] = valuesRowLhs[posLhs];
                                 colIdxsRowRes[posRes] = colIdxsRowLhs[posLhs];
+                                rowIdxsRowRes[posRes] = rowIdx;
                                 posLhs++;
+                                posRes++;
                             }
                             else {
                                 valuesRowRes[posRes] = valuesRowRhs[posRhs];
                                 colIdxsRowRes[posRes] = colIdxsRowRhs[posRhs];
+                                rowIdxsRowRes[posRes] = rowIdx;
                                 posRhs++;
+                                posRes++;
                             }
-                            rowIdxsRowRes[posRes] = rowIdx;
-                            posRes++;
                         }
                         // copy from left
                         const size_t restRowLhs = nnzRowLhs - posLhs;
                         memcpy(valuesRowRes + posRes, valuesRowLhs + posLhs, restRowLhs * sizeof(VT));
                         memcpy(colIdxsRowRes + posRes, colIdxsRowLhs + posLhs, restRowLhs * sizeof(size_t));
-                        memcpy(rowIdxsRowRes + posRes, rowIdxsRowArg1 + posLhs, restRowLhs * sizeof(size_t));
+                        std::fill_n(rowIdxsRowRes + posRes, restRowLhs, rowIdx);
                         // copy from right
                         const size_t restRowRhs = nnzRowRhs - posRhs;
                         memcpy(valuesRowRes + posRes, valuesRowRhs + posRhs, restRowRhs * sizeof(VT));
                         memcpy(colIdxsRowRes + posRes, colIdxsRowRhs + posRhs, restRowRhs * sizeof(size_t));
-                        memcpy(rowIdxsRowRes + posRes, rowIdxsRowArg2 + posRhs, restRowRhs * sizeof(size_t));
+                        std::fill_n(rowIdxsRowRes + posRes, restRowRhs, rowIdx);
 
                         valuesRowRes[posRes + restRowLhs + restRowRhs] = VT(0);
                         colIdxsRowRes[posRes + restRowLhs + restRowRhs] = size_t(-1);
@@ -341,7 +357,7 @@ struct EwBinaryMat<COOMatrix<VT>, COOMatrix<VT>, COOMatrix<VT>> {
                         size_t * rowIdxsRowRes = res->getRowIdxs(rowIdx);
                         memcpy(valuesRowRes, lhs->getValues(rowIdx), nnzRowLhs * sizeof(VT));
                         memcpy(colIdxsRowRes, lhs->getColIdxs(rowIdx), nnzRowLhs * sizeof(size_t));
-                        memcpy(rowIdxsRowRes, lhs->getRowIdxs(rowIdx), nnzRowLhs * sizeof(size_t));
+                        std::fill_n(rowIdxsRowRes, nnzRowLhs, rowIdx);
                         valuesRowRes[nnzRowLhs] = VT(0);
                         colIdxsRowRes[nnzRowLhs] = size_t(-1);
                         rowIdxsRowRes[nnzRowLhs] = size_t(-1);
@@ -353,7 +369,7 @@ struct EwBinaryMat<COOMatrix<VT>, COOMatrix<VT>, COOMatrix<VT>> {
                         size_t * rowIdxsRowRes = res->getRowIdxs(rowIdx);
                         memcpy(valuesRowRes, rhs->getValues(rowIdx), nnzRowRhs * sizeof(VT));
                         memcpy(colIdxsRowRes, rhs->getColIdxs(rowIdx), nnzRowRhs * sizeof(size_t));
-                        memcpy(rowIdxsRowRes, rhs->getRowIdxs(rowIdx), nnzRowRhs * sizeof(size_t));
+                        std::fill_n(rowIdxsRowRes, nnzRowRhs, rowIdx);
                         valuesRowRes[nnzRowRhs] = VT(0);
                         colIdxsRowRes[nnzRowRhs] = size_t(-1);
                         rowIdxsRowRes[nnzRowRhs] = size_t(-1);
@@ -381,9 +397,9 @@ struct EwBinaryMat<COOMatrix<VT>, COOMatrix<VT>, COOMatrix<VT>> {
                             if(colIdxsRowLhs[posLhs] == colIdxsRowRhs[posRhs]) {
                                 valuesRowRes[posRes] = func(valuesRowLhs[posLhs], valuesRowRhs[posRhs], ctx);
                                 colIdxsRowRes[posRes] = colIdxsRowLhs[posLhs];
+                                rowIdxsRowRes[posRes] = rowIdx;
                                 posLhs++;
                                 posRhs++;
-                                rowIdxsRowRes[posRes] = rowIdx;
                                 posRes++;
                             }
                             else if(colIdxsRowLhs[posLhs] < colIdxsRowRhs[posRhs])
@@ -514,7 +530,8 @@ struct EwBinaryMat<COOMatrix<VT>, COOMatrix<VT>, DenseMatrix<VT>> {
                         size_t * colIdxsRowRes = res->getColIdxs(rowIdx);
                         const size_t * rowIdxsRowLhs = lhs->getRowIdxs(rowIdx);
                         size_t * rowIdxsRowRes = res->getRowIdxs(rowIdx);
-                        auto rhsRow = (rhs->getNumRows() == 1 ? 0 : rowIdx);
+                        // Fix row index for rhs to first row if it is being broadcast.
+                        const size_t rhsRow = (rhs->getNumRows() == 1 ? 0 : rowIdx);
                         size_t posRes = 0;
                         for (size_t posLhs = 0; posLhs < nnzRowLhs; ++posLhs) {
                             auto rhsCol = (rhs->getNumCols() == 1 ? 0 : colIdxsRowLhs[posLhs]);
diff --git a/src/runtime/local/kernels/RandMatrix.h b/src/runtime/local/kernels/RandMatrix.h
index 053957287..db3be1e24 100644
--- a/src/runtime/local/kernels/RandMatrix.h
+++ b/src/runtime/local/kernels/RandMatrix.h
@@ -277,18 +277,16 @@ struct RandMatrix<CSRMatrix<VT>, VT> {
 template<typename VT>
 struct RandMatrix<COOMatrix<VT>, VT> {
     static void apply(COOMatrix<VT> *& res, size_t numRows, size_t numCols, VT min, VT max, double sparsity, int64_t seed, DCTX(ctx)) {
-        assert(numRows > 0 && "numRows must be > 0");
-        assert(numCols > 0 && "numCols must be > 0");
-        assert(min <= max && "min must be <= max");
-        assert(sparsity >= 0.0 && sparsity <= 1.0 &&
-               "sparsity has to be in the interval [0.0, 1.0]");
+        validateArgsRandMatrix(numRows, numCols, min, max, sparsity);
 
         // The exact number of non-zeros to generate.
         // TODO Ideally, it should not be allowed that zero is included in [min, max].
-        const auto nnz = static_cast<size_t>(round(numRows * numCols * sparsity));
+        const size_t nnz = static_cast<size_t>(round(numRows * numCols * sparsity));
 
         if (res == nullptr)
             res = DataObjectFactory::create<COOMatrix<VT>>(numRows, numCols, nnz, false);
+        else if (res->getMaxNumNonZeros() < nnz)
+            throw std::runtime_error("RandMatrix - res does not have enough space to fit all values");
 
         // Initialize pseudo random number generators.
         if (seed == -1)
@@ -309,6 +307,8 @@ struct RandMatrix<COOMatrix<VT>, VT> {
         for (size_t i = 0; i < nnz; i++)
             valuesRes[i] = distrVal(gen);
 
+        // Generate and sort random row indices where each row index can only occur
+        // `numCols` times at most.
         std::uniform_int_distribution<size_t> distrRow(0, numRows - 1);
         std::vector<size_t> rowSequence;
         std::vector<size_t> occurrences(numRows, 0);
@@ -324,23 +324,27 @@ struct RandMatrix<COOMatrix<VT>, VT> {
         }
         std::sort(rowSequence.begin(), rowSequence.end());
 
-        std::vector<size_t> colSequence;
-
-        std::vector<size_t> startRow;
-        size_t lastRow = -1;
-        for (size_t i = 0; i < nnz; ++i) {
-            if (rowSequence[i] != lastRow) startRow.push_back(i);
-            lastRow = rowSequence[i];
+        // Record the start offset of each run of equal row indices to determine the
+        // number of column indices that need to be generated for each row.
+        std::vector<size_t> rowRanges{0};
+        for (size_t i = 1; i < nnz; ++i) {
+            if (rowSequence[i-1] != rowSequence[i])
+                rowRanges.push_back(i);
         }
-        startRow.push_back(nnz);
+        rowRanges.push_back(nnz);
 
+        // Generate and sort subsequences of column indices for each row
+        // such that duplicates are only allowed in separate rows.
         std::uniform_int_distribution<size_t> distrCol(0, numCols - 1);
+        std::vector<size_t> colSequence;
 
-        for (size_t i = 0; i < startRow.size() - 1; i++) {
-            size_t start = startRow[i];
-            size_t end = startRow[i + 1];
-            std::vector<size_t> subSequence;
-            std::unordered_set<size_t> uniqueValues;
+        std::unordered_set<size_t> uniqueValues;
+        std::vector<size_t> subSequence;
+        for (size_t i = 0; i < rowRanges.size() - 1; i++) {
+            uniqueValues.clear();
+            subSequence.clear();
+            size_t start = rowRanges[i];
+            size_t end = rowRanges[i + 1];
             while (subSequence.size() < end - start) {
                 size_t randomValue = distrCol(gen);
                 if (uniqueValues.find(randomValue) == uniqueValues.end()) {
diff --git a/src/runtime/local/kernels/Transpose.h b/src/runtime/local/kernels/Transpose.h
index c27ce5e9a..b3187d870 100644
--- a/src/runtime/local/kernels/Transpose.h
+++ b/src/runtime/local/kernels/Transpose.h
@@ -23,6 +23,9 @@
 #include <runtime/local/datastructures/DenseMatrix.h>
 #include <runtime/local/datastructures/Matrix.h>
 
+#include <algorithm>
+#include <numeric>
+
 #include <cstddef>
 
 // ****************************************************************************
@@ -137,27 +140,6 @@ struct Transpose<CSRMatrix<VT>, CSRMatrix<VT>> {
     }
 };
 
-// ----------------------------------------------------------------------------
-// Matrix <- Matrix
-// ----------------------------------------------------------------------------
-
-template<typename VT>
-struct Transpose<Matrix<VT>, Matrix<VT>> {
-    static void apply(Matrix<VT> *& res, const Matrix<VT> * arg, DCTX(ctx)) {
-        const size_t numRowsRes = arg->getNumCols();
-        const size_t numColsRes = arg->getNumRows();
-
-        if (res == nullptr)
-            res = DataObjectFactory::create<DenseMatrix<VT>>(numRowsRes, numColsRes, false);
-
-        res->prepareAppend();
-        for (size_t r = 0; r < numRowsRes; ++r)
-            for (size_t c = 0; c < numColsRes; ++c)
-                res->append(r, c, arg->get(c, r));
-        res->finishAppend();
-    }
-};
-
 // ----------------------------------------------------------------------------
 // COOMatrix <- COOMatrix
 // ----------------------------------------------------------------------------
@@ -171,7 +153,6 @@ struct Transpose<COOMatrix<VT>, COOMatrix<VT>> {
         if(res == nullptr)
             res = DataObjectFactory::create<COOMatrix<VT>>(numCols, numRows, arg->getMaxNumNonZeros(), false);
 
-        // (re)initialize the matrix for consecutive set calls (because the row,col pairs are not in the correct order
         VT * valuesRes = res->getValues();
         size_t * colIdxsRes = res->getColIdxs();
         size_t * rowIdxsRes = res->getRowIdxs();
@@ -180,26 +161,48 @@ struct Transpose<COOMatrix<VT>, COOMatrix<VT>> {
         const size_t * colIdxsArg = arg->getColIdxs();
         const size_t * rowIdxsArg = arg->getRowIdxs();
 
-        std::vector<std::pair<size_t, size_t>> result;
+        // The transpose switches the column and row indices.
+        // To guarantee that res is in a valid state (proper ordering
+        // of row and column indices), the arrays are sorted using the indices
+        // from a stable argsort of arg's columns.
+        const size_t nnz = arg->getNumNonZeros();
+        std::vector<size_t> colIdxArgsort(nnz);
+        std::iota(colIdxArgsort.begin(), colIdxArgsort.end(), 0);
+        std::stable_sort(colIdxArgsort.begin(), colIdxArgsort.end(),
+            [&colIdxsArg](const size_t &i, const size_t &j) {
+                return colIdxsArg[i] < colIdxsArg[j];
+            }
+        );
 
-        size_t size = 0;
-        for (size_t i = 0; colIdxsArg[i] != size_t(-1); ++i) {
-            result.emplace_back(colIdxsArg[i], i);
-            size++;
+        for (size_t i = 0; i < nnz; ++i) {
+            valuesRes[i] = valuesArg[colIdxArgsort[i]];
+            colIdxsRes[i] = rowIdxsArg[colIdxArgsort[i]];
+            rowIdxsRes[i] = colIdxsArg[colIdxArgsort[i]];
         }
 
-        std::sort(result.begin(), result.end(), [](const auto &a, const auto &b) {
-            return a.first < b.first;
-        });
+        valuesRes[nnz] = VT(0);
+        colIdxsRes[nnz] = size_t(-1);
+        rowIdxsRes[nnz] = size_t(-1);
+    }
+};
 
-        for (size_t i = 0; i < size; ++i) {
-            valuesRes[i] = valuesArg[result[i].second];
-            colIdxsRes[i] = rowIdxsArg[result[i].second];
-            rowIdxsRes[i] = result[i].first;
-        }
+// ----------------------------------------------------------------------------
+// Matrix <- Matrix
+// ----------------------------------------------------------------------------
 
-        valuesRes[size] = int(0);
-        colIdxsRes[size] = size_t(-1);
-        rowIdxsRes[size] = size_t(-1);
+template<typename VT>
+struct Transpose<Matrix<VT>, Matrix<VT>> {
+    static void apply(Matrix<VT> *& res, const Matrix<VT> * arg, DCTX(ctx)) {
+        const size_t numRowsRes = arg->getNumCols();
+        const size_t numColsRes = arg->getNumRows();
+
+        if (res == nullptr)
+            res = DataObjectFactory::create<DenseMatrix<VT>>(numRowsRes, numColsRes, false);
+
+        res->prepareAppend();
+        for (size_t r = 0; r < numRowsRes; ++r)
+            for (size_t c = 0; c < numColsRes; ++c)
+                res->append(r, c, arg->get(c, r));
+        res->finishAppend();
     }
 };
diff --git a/test/runtime/local/datastructures/COOMatrixTest.cpp b/test/runtime/local/datastructures/COOMatrixTest.cpp
index 578b8edc5..5d79d9033 100644
--- a/test/runtime/local/datastructures/COOMatrixTest.cpp
+++ b/test/runtime/local/datastructures/COOMatrixTest.cpp
@@ -22,7 +22,10 @@
 
 #include <catch.hpp>
 
+#include <array>
+
 #include <cstdint>
+#include <cstring>
 
 TEMPLATE_TEST_CASE("COOMatrix allocates enough space", TAG_DATASTRUCTURES, ALL_VALUE_TYPES) {
     // No assertions in this test case. We just want to see if it runs without
@@ -57,28 +60,37 @@ TEST_CASE("COOMatrix methods work properly", TAG_DATASTRUCTURES) {
 
     const size_t numRows = 10;
     const size_t numCols = 10;
-    const size_t maxnumNonZeros = 6;
+    const size_t maxnumNonZeros = 7;
 
     COOMatrix<ValueType> * matrix = DataObjectFactory::create<COOMatrix<ValueType>>(numRows, numCols, maxnumNonZeros, true);
 
-    matrix->set(0, 0, 5);
-    matrix->set(2, 2, 3);
-    matrix->set(1, 1, 4);
+    matrix->set(0, 0, 6);
+    matrix->set(2, 2, 4);
+    matrix->set(1, 1, 5);
     matrix->set(3, 3, 2);
     matrix->set(4, 4, 1);
+    matrix->set(3, 2, 3);
 
-    CHECK(matrix->getMaxNumNonZeros() == 6);
-    CHECK(matrix->getNumNonZeros() == 5);
+    CHECK(matrix->getMaxNumNonZeros() == 7);
+    CHECK(matrix->getNumNonZeros() == 6);
     CHECK(matrix->getNumRows() == 10);
     CHECK(matrix->getNumCols() == 10);
     CHECK(matrix->getNumNonZerosRow(1) == 1);
     CHECK(matrix->getNumNonZerosCol(1) == 1);
-    CHECK(matrix->getValues()[0] == 5);
-    CHECK(matrix->getColIdxs()[3] == 3);
+    CHECK(matrix->getValues()[0] == 6);
+    CHECK(matrix->getColIdxs()[2] == 2);
     CHECK(matrix->getRowIdxs()[2] == 2);
-    CHECK(matrix->getValues(1)[0] == 4);
+    CHECK(matrix->getValues(1)[0] == 5);
     CHECK(matrix->getColIdxs(1)[0] == 1);
-    CHECK(matrix->get(1, 1) == 4);
+    CHECK(matrix->get(1, 1) == 5);
+
+    // Check whether coordinates are sorted properly.
+    std::array<ValueType, 6> expValues = {6, 5, 4, 3, 2, 1};
+    std::array<size_t, 6> expRowIdxs = {0, 1, 2, 3, 3, 4};
+    std::array<size_t, 6> expColIdxs = {0, 1, 2, 2, 3, 4};
+    CHECK(0 == std::memcmp(matrix->getValues(), &expValues, sizeof(ValueType) * 6));
+    CHECK(0 == std::memcmp(matrix->getRowIdxs(), &expRowIdxs, sizeof(size_t) * 6));
+    CHECK(0 == std::memcmp(matrix->getColIdxs(), &expColIdxs, sizeof(size_t) * 6));
 
     matrix->prepareAppend();
     matrix->append(0, 0, 5);
@@ -86,6 +98,7 @@ TEST_CASE("COOMatrix methods work properly", TAG_DATASTRUCTURES) {
     matrix->append(2, 2, 3);
     matrix->append(3, 3, 2);
     matrix->append(4, 4, 1);
+    matrix->finishAppend();
 
     CHECK(matrix->getNumNonZeros() == 5);
     CHECK(matrix->getNumNonZerosRow(1) == 1);
@@ -96,6 +109,16 @@ TEST_CASE("COOMatrix methods work properly", TAG_DATASTRUCTURES) {
     CHECK(matrix->getValues(1)[0] == 4);
     CHECK(matrix->getColIdxs(1)[0] == 1);
     CHECK(matrix->get(1, 1) == 4);
+
+    // Check whether coordinates are sorted properly.
+    expValues = {5, 4, 3, 2, 1};
+    expRowIdxs = {0, 1, 2, 3, 4};
+    expColIdxs = {0, 1, 2, 3, 4};
+    CHECK(0 == std::memcmp(matrix->getValues(), &expValues, sizeof(ValueType) * 5));
+    CHECK(0 == std::memcmp(matrix->getRowIdxs(), &expRowIdxs, sizeof(size_t) * 5));
+    CHECK(0 == std::memcmp(matrix->getColIdxs(), &expColIdxs, sizeof(size_t) * 5));
+
+    DataObjectFactory::destroy(matrix);
 }
 
 TEST_CASE("COOMatrix sub-matrix works properly", TAG_DATASTRUCTURES) {
@@ -139,11 +162,9 @@ TEST_CASE("COOMatrix sub-matrix works properly", TAG_DATASTRUCTURES) {
 
     // Freeing both matrices does not result in double-free errors.
     SECTION("Freeing the original matrix first is fine") {
-        DataObjectFactory::destroy(mOrig);
-        DataObjectFactory::destroy(mSub);
+        DataObjectFactory::destroy(mOrig, mSub);
     }
     SECTION("Freeing the sub-matrix first is fine") {
-        DataObjectFactory::destroy(mSub);
-        DataObjectFactory::destroy(mOrig);
+        DataObjectFactory::destroy(mSub, mOrig);
     }
 }
\ No newline at end of file
diff --git a/test/runtime/local/kernels/CheckEqTest.cpp b/test/runtime/local/kernels/CheckEqTest.cpp
index ebf189a27..15e904e69 100644
--- a/test/runtime/local/kernels/CheckEqTest.cpp
+++ b/test/runtime/local/kernels/CheckEqTest.cpp
@@ -144,7 +144,6 @@ TEMPLATE_PRODUCT_TEST_CASE("CheckEq, views on matrices", TAG_KERNELS, (DenseMatr
 }
 
 TEMPLATE_PRODUCT_TEST_CASE("CheckEq, views on matrices", TAG_KERNELS, (CSRMatrix, COOMatrix), (VALUE_TYPES)) {
-
     using DT = TestType;
     
     std::vector<typename DT::VT> vals = {
diff --git a/test/runtime/local/kernels/EwBinaryMatTest.cpp b/test/runtime/local/kernels/EwBinaryMatTest.cpp
index ae2324768..9eae934a9 100644
--- a/test/runtime/local/kernels/EwBinaryMatTest.cpp
+++ b/test/runtime/local/kernels/EwBinaryMatTest.cpp
@@ -25,6 +25,7 @@
 
 #include <catch.hpp>
 
+#include <type_traits>
 #include <vector>
 
 #include <cstdint>
@@ -32,8 +33,8 @@
 #define TEST_NAME(opName) "EwBinaryMat (" opName ")"
 #define DATA_TYPES DenseMatrix, CSRMatrix, COOMatrix, Matrix
 #define VALUE_TYPES double, uint32_t
-// CSRMatrix, COOMatrix currently only supports ADD and MUL opCodes
-#define DATA_TYPES_NO_CSR DenseMatrix, Matrix
+// CSRMatrix, COOMatrix currently only support ADD and MUL opCodes
+#define DATA_TYPES_DENSE DenseMatrix, Matrix
 
 template<class DT>
 void checkEwBinaryMat(BinaryOpCode opCode, const DT * lhs, const DT * rhs, const DT * exp) {
@@ -55,7 +56,8 @@ void checkSparseDenseEwBinaryMat(BinaryOpCode opCode, const SparseDT * lhs, cons
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("add"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
     using DT = TestType;
-    
+    using VT = typename DT::VT;    
+
     auto m0 = genGivenVals<DT>(4, {
             0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0,
@@ -68,19 +70,37 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("add"), TAG_KERNELS, (DATA_TYPES), (VALUE_T
             0, 0, 0, 0, 0, 0,
             0, 0, 0, 0, 0, 0,
     });
-    auto m2 = genGivenVals<DT>(4, {
+    DT * m2 = nullptr;
+    DT * m3 = nullptr;
+    if (std::is_unsigned_v<VT>) {
+        m2 = genGivenVals<DT>(4, {
             0, 0, 0, 0, 0, 0,
             1, 2, 3, 1, 0, 0,
             0, 0, 0, 0, 0, 0,
             0, 0, 3, 1, 0, 2,
-    });
-    auto m3 = genGivenVals<DT>(4, {
+        });
+        m3 = genGivenVals<DT>(4, {
             1, 2, 0, 0, 1, 3,
             1, 3, 3, 3, 0, 3,
             0, 0, 0, 0, 0, 0,
             0, 0, 3, 1, 0, 2,
-    });
-    
+        });
+    }
+    else {
+        m2 = genGivenVals<DT>(4, {
+            VT(-1), 0, 0, 0, 0, 0,
+            1,      2, 3, 1, 0, 0,
+            0,      0, 0, 0, 0, 0,
+            0,      0, 3, 1, 0, 2,
+        });
+        m3 = genGivenVals<DT>(4, {
+            0, 2, 0, 0, 1, 3,
+            1, 3, 3, 3, 0, 3,
+            0, 0, 0, 0, 0, 0,
+            0, 0, 3, 1, 0, 2,
+        });
+    }
+
     checkEwBinaryMat(BinaryOpCode::ADD, m0, m0, m0);
     checkEwBinaryMat(BinaryOpCode::ADD, m1, m0, m1);
     checkEwBinaryMat(BinaryOpCode::ADD, m1, m2, m3);
@@ -174,7 +194,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("mul_sparse_dense"), TAG_KERNELS, (CSRMatri
     DataObjectFactory::destroy(m0, m1, m2, m3, exp0, exp1);
 }
 
-TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("div"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (VALUE_TYPES)) {
+TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("div"), TAG_KERNELS, (DATA_TYPES_DENSE), (VALUE_TYPES)) {
     using DT = TestType;
     
     auto m0 = genGivenVals<DT>(2, {
@@ -204,7 +224,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("div"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (
 // Comparisons
 // ****************************************************************************
 
-TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("eq"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (VALUE_TYPES)) {
+TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("eq"), TAG_KERNELS, (DATA_TYPES_DENSE), (VALUE_TYPES)) {
     using DT = TestType;
     
     auto m1 = genGivenVals<DT>(2, {1, 2, 3,  4, 5, 6,});
@@ -216,7 +236,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("eq"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (V
     DataObjectFactory::destroy(m1, m2, m3);
 }
 
-TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("neq"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (VALUE_TYPES)) {
+TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("neq"), TAG_KERNELS, (DATA_TYPES_DENSE), (VALUE_TYPES)) {
     using DT = TestType;
     
     auto m1 = genGivenVals<DT>(2, {1, 2, 3,  4, 5, 6,});
@@ -228,7 +248,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("neq"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (
     DataObjectFactory::destroy(m1, m2, m3);
 }
 
-TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("lt"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (VALUE_TYPES)) {
+TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("lt"), TAG_KERNELS, (DATA_TYPES_DENSE), (VALUE_TYPES)) {
     using DT = TestType;
     
     auto m1 = genGivenVals<DT>(2, {1, 2, 3,  4, 5, 6,});
@@ -240,7 +260,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("lt"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (V
     DataObjectFactory::destroy(m1, m2, m3);
 }
 
-TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("le"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (VALUE_TYPES)) {
+TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("le"), TAG_KERNELS, (DATA_TYPES_DENSE), (VALUE_TYPES)) {
     using DT = TestType;
     
     auto m1 = genGivenVals<DT>(2, {1, 2, 3,  4, 5, 6,});
@@ -252,7 +272,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("le"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (V
     DataObjectFactory::destroy(m1, m2, m3);
 }
 
-TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("gt"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (VALUE_TYPES)) {
+TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("gt"), TAG_KERNELS, (DATA_TYPES_DENSE), (VALUE_TYPES)) {
     using DT = TestType;
     
     auto m1 = genGivenVals<DT>(2, {1, 2, 3,  4, 5, 6,});
@@ -264,7 +284,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("gt"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (V
     DataObjectFactory::destroy(m1, m2, m3);
 }
 
-TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ge"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (VALUE_TYPES)) {
+TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ge"), TAG_KERNELS, (DATA_TYPES_DENSE), (VALUE_TYPES)) {
     using DT = TestType;
     
     auto m1 = genGivenVals<DT>(2, {1, 2, 3,  4, 5, 6,});
@@ -280,7 +300,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ge"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (V
 // Min/max
 // ****************************************************************************
 
-TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("min"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (VALUE_TYPES)) {
+TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("min"), TAG_KERNELS, (DATA_TYPES_DENSE), (VALUE_TYPES)) {
     using DT = TestType;
     
     auto m1 = genGivenVals<DT>(2, {1, 2, 3,  4, 5, 6,});
@@ -292,7 +312,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("min"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (
     DataObjectFactory::destroy(m1, m2, m3);
 }
 
-TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("max"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (VALUE_TYPES)) {
+TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("max"), TAG_KERNELS, (DATA_TYPES_DENSE), (VALUE_TYPES)) {
     using DT = TestType;
     
     auto m1 = genGivenVals<DT>(2, {1, 2, 3,  4, 5, 6,});
@@ -308,7 +328,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("max"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (
 // Logical
 // ****************************************************************************
 
-TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("and"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (VALUE_TYPES)) {
+TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("and"), TAG_KERNELS, (DATA_TYPES_DENSE), (VALUE_TYPES)) {
     using DT = TestType;
     using VT = typename DT::VT;
     
@@ -321,7 +341,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("and"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (
     DataObjectFactory::destroy(m1, m2, m3);
 }
 
-TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("or"), TAG_KERNELS, (DATA_TYPES_NO_CSR), (VALUE_TYPES)) {
+TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("or"), TAG_KERNELS, (DATA_TYPES_DENSE), (VALUE_TYPES)) {
     using DT = TestType;
     using VT = typename DT::VT;
     
diff --git a/test/runtime/local/kernels/EwUnaryMatTest.cpp b/test/runtime/local/kernels/EwUnaryMatTest.cpp
index 5786c698e..89b5b041f 100644
--- a/test/runtime/local/kernels/EwUnaryMatTest.cpp
+++ b/test/runtime/local/kernels/EwUnaryMatTest.cpp
@@ -28,6 +28,7 @@
 #include <catch.hpp>
 
 #include <limits>
+#include <type_traits>
 
 #include <cstdint>
 
@@ -52,7 +53,7 @@ void checkEwUnaryMatApprox(UnaryOpCode opCode, const DTArg * arg, const DTRes *
 }
 
 template<typename DTRes, typename DTArg>
-void checkEwUnaryMatThrow(UnaryOpCode opCode, const DTArg * arg, [[maybe_unused]] const DTRes * exp) {
+void checkEwUnaryMatThrow(UnaryOpCode opCode, const DTArg * arg) {
     DTRes * res = nullptr;
     REQUIRE_THROWS_AS((ewUnaryMat<DTRes, DTArg>(opCode, res, arg, nullptr)), std::domain_error);
     DataObjectFactory::destroy(res);
@@ -63,15 +64,21 @@ void checkEwUnaryMatThrow(UnaryOpCode opCode, const DTArg * arg, [[maybe_unused]
 // ****************************************************************************
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("abs"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         -1,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         0,
         1,
         1,
@@ -83,15 +90,20 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("abs"), TAG_KERNELS, (DATA_TYPES), (VALUE_T
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sign"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(2, {
+    auto arg = genGivenVals<DTArg>(2, {
         0, 1, -1,
         10, -10, VT(1.4),
     });
 
-    auto exp = genGivenVals<DT>(2, {
+    auto exp = genGivenVals<DTRes>(2, {
         0, 1, -1,
         1, -1, 1,
     });
@@ -102,15 +114,20 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sign"), TAG_KERNELS, (DATA_TYPES), (VALUE_
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sign, floating-point-specific"), TAG_KERNELS, (DATA_TYPES), (double)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(2, {
+    auto arg = genGivenVals<DTArg>(2, {
         std::numeric_limits<VT>::infinity(),
         - std::numeric_limits<VT>::infinity(),
     });
 
-    auto exp = genGivenVals<DT>(2, {
+    auto exp = genGivenVals<DTRes>(2, {
         1,
         -1,
     });
@@ -121,15 +138,21 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sign, floating-point-specific"), TAG_KERNE
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sqrt"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         16,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         0,
         1,
         4,
@@ -143,6 +166,11 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sqrt"), TAG_KERNELS, (DATA_TYPES), (VALUE_
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sqrt, check domain_error"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
     using DT = TestType;
     using VT = typename DT::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DT, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DT
+                >::type;
 
     auto arg = genGivenVals<DT>(3, {
         0,
@@ -150,24 +178,27 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sqrt, check domain_error"), TAG_KERNELS, (
         -1,
     });
 
-    DenseMatrix<VT> * dense = nullptr;
-
-    checkEwUnaryMatThrow(UnaryOpCode::SQRT, arg, dense);
+    checkEwUnaryMatThrow<DTRes, DT>(UnaryOpCode::SQRT, arg);
 
     DataObjectFactory::destroy(arg);
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("exp"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         -1,
         3,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         1,
         VT(0.367),
         VT(20.085),
@@ -179,16 +210,21 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("exp"), TAG_KERNELS, (DATA_TYPES), (VALUE_T
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ln"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         1,
         3,
         8,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         0,
         VT(1.098),
         VT(2.079),
@@ -202,6 +238,11 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ln"), TAG_KERNELS, (DATA_TYPES), (VALUE_TY
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ln, check domain_error"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
     using DT = TestType;
     using VT = typename DT::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DT, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DT
+                >::type;
 
     auto arg = genGivenVals<DT>(3, {
         0,
@@ -209,9 +250,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ln, check domain_error"), TAG_KERNELS, (DA
         -1,
     });
 
-    DenseMatrix<VT> * dense = nullptr;
-
-    checkEwUnaryMatThrow(UnaryOpCode::LN, arg, dense);
+    checkEwUnaryMatThrow<DTRes, DT>(UnaryOpCode::LN, arg);
 
     DataObjectFactory::destroy(arg);
 }
@@ -221,16 +260,21 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ln, check domain_error"), TAG_KERNELS, (DA
 // ****************************************************************************
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sin"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         -1,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         0,
         VT(0.841),
         VT(-0.841),
@@ -242,16 +286,21 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sin"), TAG_KERNELS, (DATA_TYPES), (VALUE_T
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("cos"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         -1,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         1,
         VT(0.54),
         VT(0.54),
@@ -263,16 +312,21 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("cos"), TAG_KERNELS, (DATA_TYPES), (VALUE_T
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("tan"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         -1,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         0,
         VT(1.557),
         VT(-1.557),
@@ -284,16 +338,21 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("tan"), TAG_KERNELS, (DATA_TYPES), (VALUE_T
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("asin"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         -1,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         0,
         VT(1.57),
         VT(-1.57),
@@ -307,6 +366,11 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("asin"), TAG_KERNELS, (DATA_TYPES), (VALUE_
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("asin, check domain_error"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
     using DT = TestType;
     using VT = typename DT::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DT, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DT
+                >::type;
 
     auto arg = genGivenVals<DT>(3, {
         0,
@@ -314,24 +378,27 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("asin, check domain_error"), TAG_KERNELS, (
         -2,
     });
 
-    DenseMatrix<VT> * dense = nullptr;
-
-    checkEwUnaryMatThrow(UnaryOpCode::ASIN, arg, dense);
+    checkEwUnaryMatThrow<DTRes, DT>(UnaryOpCode::ASIN, arg);
 
     DataObjectFactory::destroy(arg);
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("acos"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         -1,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         VT(1.57),
         0,
         VT(3.141),
@@ -345,6 +412,11 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("acos"), TAG_KERNELS, (DATA_TYPES), (VALUE_
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("acos, check domain_error"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
     using DT = TestType;
     using VT = typename DT::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DT, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DT
+                >::type;
 
     auto arg = genGivenVals<DT>(3, {
         0,
@@ -352,24 +424,27 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("acos, check domain_error"), TAG_KERNELS, (
         -2,
     });
 
-    DenseMatrix<VT> * dense = nullptr;
-
-    checkEwUnaryMatThrow(UnaryOpCode::ACOS, arg, dense);
+    checkEwUnaryMatThrow<DTRes, DT>(UnaryOpCode::ACOS, arg);
 
     DataObjectFactory::destroy(arg);
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("atan"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         -1,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         0,
         VT(0.785),
         VT(-0.785),
@@ -381,16 +456,21 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("atan"), TAG_KERNELS, (DATA_TYPES), (VALUE_
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sinh"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         -1,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         0,
         VT(1.175),
         VT(-1.175),
@@ -402,16 +482,21 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("sinh"), TAG_KERNELS, (DATA_TYPES), (VALUE_
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("cosh"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         -1,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         1,
         VT(1.543),
         VT(1.543),
@@ -423,16 +508,21 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("cosh"), TAG_KERNELS, (DATA_TYPES), (VALUE_
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("tanh"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         -1,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         0,
         VT(0.761),
         VT(-0.761),
@@ -448,15 +538,21 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("tanh"), TAG_KERNELS, (DATA_TYPES), (VALUE_
 // ****************************************************************************
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("floor"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         -1,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         0,
         1,
         -1,
@@ -468,14 +564,20 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("floor"), TAG_KERNELS, (DATA_TYPES), (VALUE
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("floor, floating-point-specific"), TAG_KERNELS, (DATA_TYPES), (double)) {
-    using DT = TestType;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(2, {
+    auto arg = genGivenVals<DTArg>(2, {
         0.3, -0.3,
         0.9, -0.9,
     });
 
-    auto exp = genGivenVals<DT>(2, {
+    auto exp = genGivenVals<DTRes>(2, {
         0, -1,
         0, -1,
     });
@@ -486,15 +588,21 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("floor, floating-point-specific"), TAG_KERN
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ceil"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         -1,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         0,
         1,
         -1,
@@ -506,14 +614,20 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ceil"), TAG_KERNELS, (DATA_TYPES), (VALUE_
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ceil, floating-point-specific"), TAG_KERNELS, (DATA_TYPES), (double)) {
-    using DT = TestType;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(2, {
+    auto arg = genGivenVals<DTArg>(2, {
         0.3, -0.3,
         1.1, -1.9,
     });
 
-    auto exp = genGivenVals<DT>(2, {
+    auto exp = genGivenVals<DTRes>(2, {
         1, -0.0,
         2, -1,
     });
@@ -524,15 +638,21 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("ceil, floating-point-specific"), TAG_KERNE
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("round"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
-    using DT = TestType;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(3, {
+    auto arg = genGivenVals<DTArg>(3, {
         0,
         1,
         -1,
     });
 
-    auto exp = genGivenVals<DT>(3, {
+    auto exp = genGivenVals<DTRes>(3, {
         0,
         1,
         -1,
@@ -544,14 +664,20 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("round"), TAG_KERNELS, (DATA_TYPES), (VALUE
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("round, floating-point-specific"), TAG_KERNELS, (DATA_TYPES), (double)) {
-    using DT = TestType;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(2, {
+    auto arg = genGivenVals<DTArg>(2, {
         0.3, -0.3,
         0.5, -0.5,
     });
 
-    auto exp = genGivenVals<DT>(2, {
+    auto exp = genGivenVals<DTRes>(2, {
         0, -0.0,
         1, -1,
     });
@@ -566,16 +692,22 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("round, floating-point-specific"), TAG_KERN
 // ****************************************************************************
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("isNan"), TAG_KERNELS, (DATA_TYPES), (int32_t)) {
-    using DT = TestType;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(4, {
+    auto arg = genGivenVals<DTArg>(4, {
         1,
         0,
         99,
         -99, 
     });
 
-    auto exp = genGivenVals<DT>(4, {
+    auto exp = genGivenVals<DTRes>(4, {
         0,
         0,
         0,
@@ -588,10 +720,15 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("isNan"), TAG_KERNELS, (DATA_TYPES), (int32
 }
 
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("isNan, floating-point specific"), TAG_KERNELS, (DATA_TYPES), (double)) {
-    using DT = TestType;
-    using VT = typename DT::VT;
+    using DTArg = TestType;
+    using VT = typename DTArg::VT;
+    using DTRes = typename std::conditional<
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
-    auto arg = genGivenVals<DT>(9, {
+    auto arg = genGivenVals<DTArg>(9, {
         1,
         std::numeric_limits<VT>::quiet_NaN(),
         0,
@@ -603,7 +740,7 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("isNan, floating-point specific"), TAG_KERN
         std::numeric_limits<VT>::denorm_min()
     });
 
-    auto exp = genGivenVals<DT>(9, {
+    auto exp = genGivenVals<DTRes>(9, {
         0,
         1,
         0,
@@ -627,11 +764,15 @@ TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("isNan, floating-point specific"), TAG_KERN
 TEMPLATE_PRODUCT_TEST_CASE(TEST_NAME("some invalid op-code"), TAG_KERNELS, (DATA_TYPES), (VALUE_TYPES)) {
     using DTArg = TestType;
     using VT = typename DTArg::VT;
-    using DTRes = DenseMatrix<VT>;
+    using DTRes = typename std::conditional< // result type: DenseMatrix<VT> when the argument is a COOMatrix<VT>, otherwise the argument type itself
+                    std::is_same<DTArg, COOMatrix<VT>>::value,
+                    DenseMatrix<VT>,
+                    DTArg
+                >::type;
 
     auto arg = genGivenVals<DTArg>(1, {1});
     DTRes * exp = nullptr;
     CHECK_THROWS(ewUnaryMat<DTRes, DTArg>(static_cast<UnaryOpCode>(999), exp, arg, nullptr));
 
-    DataObjectFactory::destroy(arg);
+    DataObjectFactory::destroy(arg, exp); // NOTE(review): exp was initialized to nullptr above and the kernel call is expected to throw - confirm destroy() accepts a null pointer
 }