From 02dd2cfdf33fc0a69b1a896d1851bd10b912954c Mon Sep 17 00:00:00 2001
From: Samin <bassirisamin@gmail.com>
Date: Wed, 20 Nov 2024 02:41:45 +0100
Subject: [PATCH] - Add 'group()' built-in function to DaphneDSL.  - This
 built-in function creates a GroupOp in DaphneIR.  - Only support 'SUM' as an
 aggregation function.  - Get only one aggregation column.  - Get an arbitrary
 number of columns to group on. - Add support for string values in the 'group'
 kernel function.   - SUM, MIN, and MAX are the only aggregation functions
 applied to string columns.   - Other aggregation functions throw an exception
 if they receive strings as arguments or results.   - Additionally,
 'DeduceValueTypeAndExecute' cannot handle string values due to unsupported
 operations on strings.   - Therefore, 'ColumnGroupAggStringVTArg' that is
 specialized for strings is used.   - Or 'ColumnGroupAgg' is called
 exclusively with string values. - The 'group' function internally calls the
 'order' and 'extractCol' kernel functions.   - These two functions are
 updated to handle string values correctly. - Added script-level test cases to
 validate the new functionality. - Close issue #903

---
 src/parser/daphnedsl/DaphneDSLBuiltins.cpp |  35 ++++++++
 src/runtime/local/kernels/ExtractRow.h     |  27 +++---
 src/runtime/local/kernels/Group.h          | 100 ++++++++++++++++-----
 src/runtime/local/kernels/Order.h          |  21 +++--
 test/api/cli/operations/OperationsTest.cpp |   1 +
 test/api/cli/operations/groupSum_1.daphne  |   8 ++
 test/api/cli/operations/groupSum_1.txt     |   5 ++
 7 files changed, 157 insertions(+), 40 deletions(-)
 create mode 100644 test/api/cli/operations/groupSum_1.daphne
 create mode 100644 test/api/cli/operations/groupSum_1.txt
diff --git a/src/parser/daphnedsl/DaphneDSLBuiltins.cpp b/src/parser/daphnedsl/DaphneDSLBuiltins.cpp
index bc136b9d0..34832a0bf 100644
--- a/src/parser/daphnedsl/DaphneDSLBuiltins.cpp
+++ b/src/parser/daphnedsl/DaphneDSLBuiltins.cpp
@@ -1033,6 +1033,41 @@ antlrcpp::Any DaphneDSLBuiltins::build(mlir::Location loc, const std::string &fu
                                  utils.matrixOfSizeType, lhs, rhs, lhsOn, rhsOn, rhsAgg)
             .getResults();
     }
+    if (func == "group") {
+        // arbitrary number of columns to group on.
+        // A single column to calculate the sum on.
+        checkNumArgsMin(loc, func, numArgs, 3);
+        mlir::Value currentFrame = args[0];
+        mlir::Value aggCol = args[1];
+        std::vector<mlir::Value> groupName;
+        std::vector<mlir::Value> columnName;
+        std::vector<mlir::Type> colTypes;
+
+        // set aggregaton function to SUM
+        auto aggFunc = static_cast<mlir::Attribute>(
+            mlir::daphne::GroupEnumAttr::get(builder.getContext(), mlir::daphne::GroupEnum::SUM));
+        std::vector<mlir::Attribute> functionName;
+        functionName.push_back(aggFunc);
+
+        // get group columns
+        for (size_t i = 2; i < numArgs; i++) {
+            groupName.push_back(args[i]);
+        }
+
+        // get agg column
+        columnName.push_back(aggCol);
+
+        // result column types
+        mlir::Type vt = utils.unknownType;
+        for (size_t i = 0; i < groupName.size() + columnName.size(); i++) {
+            std::cout << "4";
+            colTypes.push_back(vt);
+        }
+
+        return static_cast<mlir::Value>(builder.create<GroupOp>(loc, FrameType::get(builder.getContext(), colTypes),
+                                                                currentFrame, groupName, columnName,
+                                                                builder.getArrayAttr(functionName)));
+    }
 
     // ********************************************************************
     // Frame label manipulation
diff --git a/src/runtime/local/kernels/ExtractRow.h b/src/runtime/local/kernels/ExtractRow.h
index 442bccdd0..a4367d198 100644
--- a/src/runtime/local/kernels/ExtractRow.h
+++ b/src/runtime/local/kernels/ExtractRow.h
@@ -118,16 +118,23 @@ template <typename VTSel> struct ExtractRow<Frame, Frame, VTSel> {
                 throw std::out_of_range(errMsg.str());
             }
             for (size_t c = 0; c < numCols; c++) {
-                // We always copy in units of 8 bytes (uint64_t). If the
-                // actual element size is lower, the superfluous bytes will
-                // be overwritten by the next match. With this approach, we
-                // do not need to call memcpy for each element, nor
-                // interpret the types for a L/S of fitting size.
-                // TODO Don't multiply by elementSize, but left-shift by
-                // ld(elementSize).
-                *reinterpret_cast<uint64_t *>(resCols[c]) =
-                    *reinterpret_cast<const uint64_t *>(argCols[c] + pos * elementSizes[c]);
-                resCols[c] += elementSizes[c];
+                if (schema[c] == ValueTypeCode::STR) {
+                    // Handle std::string column
+                    *reinterpret_cast<std::string *>(resCols[c]) =
+                        *reinterpret_cast<const std::string *>(argCols[c] + pos * elementSizes[c]);
+                    resCols[c] += elementSizes[c];
+                } else {
+                    // We always copy in units of 8 bytes (uint64_t). If the
+                    // actual element size is lower, the superfluous bytes will
+                    // be overwritten by the next match. With this approach, we
+                    // do not need to call memcpy for each element, nor
+                    // interpret the types for a L/S of fitting size.
+                    // TODO Don't multiply by elementSize, but left-shift by
+                    // ld(elementSize).
+                    *reinterpret_cast<uint64_t *>(resCols[c]) =
+                        *reinterpret_cast<const uint64_t *>(argCols[c] + pos * elementSizes[c]);
+                    resCols[c] += elementSizes[c];
+                }
             }
         }
         res->shrinkNumRows(numRowsSel);
diff --git a/src/runtime/local/kernels/Group.h b/src/runtime/local/kernels/Group.h
index 0d8df6410..f4b7b58da 100644
--- a/src/runtime/local/kernels/Group.h
+++ b/src/runtime/local/kernels/Group.h
@@ -58,6 +58,23 @@ void group(DT *&res, const DT *arg, const char **keyCols, size_t numKeyCols, con
 // Frame <- Frame
 // ----------------------------------------------------------------------------
 
+inline std::string myStringifyGroupEnum(mlir::daphne::GroupEnum val) {
+    using mlir::daphne::GroupEnum;
+    switch (val) {
+    case GroupEnum::COUNT:
+        return "COUNT";
+    case GroupEnum::SUM:
+        return "SUM";
+    case GroupEnum::MIN:
+        return "MIN";
+    case GroupEnum::MAX:
+        return "MAX";
+    case GroupEnum::AVG:
+        return "AVG";
+    }
+    return "";
+}
+
 // returns the result of the aggregation function aggFunc over the (contiguous)
 // memory between the begin and end pointer
 template <typename VTRes, typename VTArg>
@@ -65,26 +82,60 @@ VTRes aggregate(const mlir::daphne::GroupEnum &aggFunc, const VTArg *begin, cons
     using mlir::daphne::GroupEnum;
     switch (aggFunc) {
     case GroupEnum::COUNT:
-        return end - begin;
+        if constexpr (std::is_same<VTRes, std::string>::value)
+            throw std::invalid_argument(std::string("aggregate: ") + myStringifyGroupEnum(aggFunc) +
+                                        std::string(" aggregation is not supported for these value types."));
+        else
+            return end - begin;
         break; // TODO: Do we need to check for Null elements here?
     case GroupEnum::SUM:
-        return std::accumulate(begin, end, (VTRes)0);
+        if constexpr ((std::is_same<VTRes, std::string>::value) || (std::is_same<VTArg, std::string>::value))
+            throw std::invalid_argument(std::string("aggregate: ") + myStringifyGroupEnum(aggFunc) +
+                                        std::string(" aggregation is not supported for these value types."));
+        else
+            return std::accumulate(begin, end, (VTRes)0);
         break;
     case GroupEnum::MIN:
-        return *std::min_element(begin, end);
+        if constexpr ((std::is_same<VTRes, std::string>::value) || (std::is_same<VTArg, std::string>::value))
+            throw std::invalid_argument(std::string("aggregate: ") + myStringifyGroupEnum(aggFunc) +
+                                        std::string(" aggregation is not supported for these value types."));
+        else
+            return *std::min_element(begin, end);
         break;
     case GroupEnum::MAX:
-        return *std::max_element(begin, end);
+        if constexpr ((std::is_same<VTRes, std::string>::value) || (std::is_same<VTArg, std::string>::value))
+            throw std::invalid_argument(std::string("aggregate: ") + myStringifyGroupEnum(aggFunc) +
+                                        std::string(" aggregation is not supported for these value types."));
+        else
+            return *std::max_element(begin, end);
         break;
     case GroupEnum::AVG:
-        return std::accumulate(begin, end, (double)0) / (double)(end - begin);
+        if constexpr ((std::is_same<VTRes, std::string>::value) || (std::is_same<VTArg, std::string>::value))
+            throw std::invalid_argument(std::string("aggregate: ") + myStringifyGroupEnum(aggFunc) +
+                                        std::string(" aggregation is not supported for these value types."));
+        else
+            return std::accumulate(begin, end, (double)0) / (double)(end - begin);
         break;
     default:
-        return *begin;
+        if constexpr (std::is_same<VTArg, std::string>::value || std::is_same<VTRes, std::string>::value)
+            throw std::invalid_argument("aggregate: Unsupported aggregation operation for string types.");
+        else
+            return *begin;
         break;
     }
 }
 
+template <>
+std::string aggregate(const mlir::daphne::GroupEnum &aggFunc, const std::string *begin, const std::string *end) {
+    using mlir::daphne::GroupEnum;
+    if (aggFunc == GroupEnum::MIN)
+        return *std::min_element(begin, end);
+    if (aggFunc == GroupEnum::MAX)
+        return *std::max_element(begin, end);
+    else
+        return *begin;
+}
+
 // struct which calls the aggregate() function (specified via aggFunc) on each
 // duplicate group in the groups vector and on all implied single groups for a
 // sepcified column (colIdx) of the argument frame (arg) and stores the result
@@ -117,22 +168,14 @@ template <typename VTRes, typename VTArg> struct ColumnGroupAgg {
     }
 };
 
-inline std::string myStringifyGroupEnum(mlir::daphne::GroupEnum val) {
-    using mlir::daphne::GroupEnum;
-    switch (val) {
-    case GroupEnum::COUNT:
-        return "COUNT";
-    case GroupEnum::SUM:
-        return "SUM";
-    case GroupEnum::MIN:
-        return "MIN";
-    case GroupEnum::MAX:
-        return "MAX";
-    case GroupEnum::AVG:
-        return "AVG";
+// Since DeduceValueTypeAndExecute can not handle string values,
+// we add special ColumnGroupAgg function for arg with std::string values.
+template <typename VTRes> struct ColumnGroupAggStringVTArg {
+    static void apply(Frame *res, const Frame *arg, size_t colIdx, std::vector<std::pair<size_t, size_t>> *groups,
+                      mlir::daphne::GroupEnum aggFunc, DCTX(ctx)) {
+        ColumnGroupAgg<VTRes, std::string>::apply(res, arg, colIdx, groups, aggFunc, ctx);
     }
-    return "";
-}
+};
 
 template <> struct Group<Frame> {
     static void apply(Frame *&res, const Frame *arg, const char **keyCols, size_t numKeyCols, const char **aggCols,
@@ -270,9 +313,18 @@ template <> struct Group<Frame> {
 
         // copying key columns and column-wise group aggregation
         for (size_t i = 0; i < numColsRes; i++) {
-            DeduceValueTypeAndExecute<ColumnGroupAgg>::apply(
-                res->getSchema()[i], ordered->getSchema()[i], res, ordered, i, groups,
-                (i < numKeyCols) ? (GroupEnum)0 : aggFuncs[i - numKeyCols], ctx);
+            if (ordered->getSchema()[i] == ValueTypeCode::STR) {
+                if (ordered->getSchema()[i] == ValueTypeCode::STR)
+                    ColumnGroupAgg<std::string, std::string>::apply(
+                        res, ordered, i, groups, (i < numKeyCols) ? (GroupEnum)0 : aggFuncs[i - numKeyCols], ctx);
+                else
+                    DeduceValueTypeAndExecute<ColumnGroupAggStringVTArg>::apply(
+                        res->getSchema()[i], res, ordered, i, groups,
+                        (i < numKeyCols) ? (GroupEnum)0 : aggFuncs[i - numKeyCols], ctx);
+            } else
+                DeduceValueTypeAndExecute<ColumnGroupAgg>::apply(
+                    res->getSchema()[i], ordered->getSchema()[i], res, ordered, i, groups,
+                    (i < numKeyCols) ? (GroupEnum)0 : aggFuncs[i - numKeyCols], ctx);
         }
         delete groups;
         DataObjectFactory::destroy(ordered);
diff --git a/src/runtime/local/kernels/Order.h b/src/runtime/local/kernels/Order.h
index afa6d1a10..5b78e6a6a 100644
--- a/src/runtime/local/kernels/Order.h
+++ b/src/runtime/local/kernels/Order.h
@@ -197,8 +197,11 @@ struct OrderFrame {
 
         if (numColIdxs > 1) {
             for (size_t i = 0; i < numColIdxs - 1; i++) {
-                DeduceValueTypeAndExecute<MultiColumnIDSort>::apply(arg->getSchema()[colIdxs[i]], arg, idx, groups,
-                                                                    ascending[i], colIdxs[i], ctx);
+                if (arg->getSchema()[colIdxs[i]] == ValueTypeCode::STR)
+                    MultiColumnIDSort<std::string>::apply(arg, idx, groups, ascending[i], colIdxs[i], ctx);
+                else
+                    DeduceValueTypeAndExecute<MultiColumnIDSort>::apply(arg->getSchema()[colIdxs[i]], arg, idx, groups,
+                                                                        ascending[i], colIdxs[i], ctx);
             }
         }
 
@@ -206,11 +209,17 @@ struct OrderFrame {
         // use
         size_t colIdx = colIdxs[numColIdxs - 1];
         if (groupsRes == nullptr) {
-            DeduceValueTypeAndExecute<ColumnIDSort>::apply(arg->getSchema()[colIdx], arg, idx, groups,
-                                                           ascending[numColIdxs - 1], colIdx, ctx);
+            if (arg->getSchema()[colIdx] == ValueTypeCode::STR)
+                ColumnIDSort<std::string>::apply(arg, idx, groups, ascending[numColIdxs - 1], colIdx, ctx);
+            else
+                DeduceValueTypeAndExecute<ColumnIDSort>::apply(arg->getSchema()[colIdx], arg, idx, groups,
+                                                               ascending[numColIdxs - 1], colIdx, ctx);
         } else {
-            DeduceValueTypeAndExecute<MultiColumnIDSort>::apply(arg->getSchema()[colIdx], arg, idx, groups,
-                                                                ascending[numColIdxs - 1], colIdx, ctx);
+            if (arg->getSchema()[colIdx] == ValueTypeCode::STR)
+                MultiColumnIDSort<std::string>::apply(arg, idx, groups, ascending[numColIdxs - 1], colIdx, ctx);
+            else
+                DeduceValueTypeAndExecute<MultiColumnIDSort>::apply(arg->getSchema()[colIdx], arg, idx, groups,
+                                                                    ascending[numColIdxs - 1], colIdx, ctx);
             groupsRes->insert(groupsRes->end(), groups.begin(), groups.end());
         }
     }
diff --git a/test/api/cli/operations/OperationsTest.cpp b/test/api/cli/operations/OperationsTest.cpp
index 47857ceb0..5469fef2d 100644
--- a/test/api/cli/operations/OperationsTest.cpp
+++ b/test/api/cli/operations/OperationsTest.cpp
@@ -40,6 +40,7 @@ MAKE_TEST_CASE("createFrame", 1)
 MAKE_TEST_CASE("ctable", 1)
 MAKE_TEST_CASE("fill", 1)
 MAKE_TEST_CASE("gemv", 1)
+MAKE_TEST_CASE("groupSum", 1)
 MAKE_TEST_CASE("idxMax", 1)
 MAKE_TEST_CASE("idxMin", 1)
 MAKE_TEST_CASE("isNan", 1)
diff --git a/test/api/cli/operations/groupSum_1.daphne b/test/api/cli/operations/groupSum_1.daphne
new file mode 100644
index 000000000..3deb43606
--- /dev/null
+++ b/test/api/cli/operations/groupSum_1.daphne
@@ -0,0 +1,8 @@
+// constant folding of casts between unsigned, signed, signless types
+
+fr = createFrame([ "String1", "String1", "String2", "String3", "String3" ], [ 1, 1, 2, 3, 4 ],
+                 [ 100, 200, 300, 400, 500 ], "a", "b", "c");
+
+res = group(fr, "c", "a", "b");
+
+print(res);
\ No newline at end of file
diff --git a/test/api/cli/operations/groupSum_1.txt b/test/api/cli/operations/groupSum_1.txt
new file mode 100644
index 000000000..724f21c70
--- /dev/null
+++ b/test/api/cli/operations/groupSum_1.txt
@@ -0,0 +1,5 @@
+Frame(4x3, [a:std::string, b:int64_t, SUM(c):int64_t])
+String1 1 300
+String2 2 300
+String3 3 400
+String3 4 500