[DAPHNE-#77] Avoid stack overflow in long-running DaphneDSL loops. #820

Merged 1 commit on Sep 5, 2024
101 changes: 93 additions & 8 deletions src/compiler/lowering/LowerToLLVMPass.cpp
@@ -45,6 +45,20 @@

using namespace mlir;

// Remark on the creation of mlir::LLVM::AllocaOp
// ==============================================
// This pass creates an mlir::LLVM::AllocaOp in several places and for various purposes,
// e.g., to store the result pointer of a kernel call, for variadic operands/results, etc.
// An AllocaOp should not be placed inside a loop, as its repeated execution at run time can
// lead to a stack overflow (depending on the number of iterations, the number of AllocaOps
// inside the loop, and the stack size). The reason is that the memory allocated by an AllocaOp
// is freed only at the end of the enclosing scope (i.e., the function).
// To avoid such problems, we don't create AllocaOps at the original insertion point of
// the rewriter, but at the beginning of the function surrounding the currently considered op.
// To this end, we use the rewriter's ability to switch between different insertion points.
// Note that the memory allocated by an AllocaOp can be reused across repeated
// kernel calls.
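//
// A minimal sketch (illustration only, not part of this patch) of the hoisting pattern
// described above, assuming that `rewriter`, `op`, `loc`, and an `i8PtrType` are in scope
// as in the lowering patterns below:
//
//   OpBuilder::InsertPoint ipHere = rewriter.saveInsertionPoint();
//   Block & entry = op.getOperation()->getParentOfType<LLVM::LLVMFuncOp>().getBody().front();
//   rewriter.setInsertionPointToStart(&entry);
//   Value numElems = rewriter.create<arith::ConstantOp>(loc, rewriter.getI64IntegerAttr(1));
//   Value buf = rewriter.create<LLVM::AllocaOp>(loc, i8PtrType, numElems);
//   rewriter.restoreInsertionPoint(ipHere);
//
// `buf` is allocated once per invocation of the surrounding function, so repeated loop
// iterations reuse the same stack slot instead of growing the stack.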

// Optional attribute of CallKernelOp, which indicates that all results shall
// be combined into a single variadic result.
const std::string ATTR_HASVARIADICRESULTS = "hasVariadicResults";
@@ -101,12 +115,28 @@ class ConstantOpLowering : public OpConversionPattern<daphne::ConstantOp>
const size_t numChars = sr.size() + 1; // +1 for trailing '\0'
const std::string str = sr.str();
const char * chars = str.c_str();

// We could assume that the daphne::ConstantOp `op` is *not* inside a loop,
// because constants are typically moved to the top of a function during
// canonicalization. Consequently, we would not need to change the insertion
// point here. However, to be on the safe side, we change it anyway.

// Set the insertion point to the beginning of the function surrounding this ConstantOp
// (see comment on AllocaOp above).
OpBuilder::InsertPoint ipHere = rewriter.saveInsertionPoint();
Block & fb = op.getOperation()->getParentOfType<LLVM::LLVMFuncOp>().getBody().front();
rewriter.setInsertionPointToStart(&fb);

auto allocaOp = rewriter.replaceOpWithNewOp<LLVM::AllocaOp>(
op.getOperation(),
i8PtrType,
rewriter.create<arith::ConstantOp>(loc, rewriter.getI64IntegerAttr(numChars)),
1
);

// Go back to the original insertion point.
rewriter.restoreInsertionPoint(ipHere);

for(size_t i = 0; i < numChars; i++) {
std::vector<Value> indices = {
rewriter.create<arith::ConstantOp>(loc, rewriter.getI64IntegerAttr(i))
Expand Down Expand Up @@ -301,7 +331,7 @@ class CallKernelOpLowering : public OpConversionPattern<daphne::CallKernelOp>

auto kernelOperands = allocOutputReferences(
loc, rewriter, adaptor.getOperands(), inputOutputTypes,
op->getNumResults(), hasVarRes);
op->getNumResults(), hasVarRes, op);

// call function
// The kernel call has an empty list of return types, because our
Expand Down Expand Up @@ -358,22 +388,40 @@ class CallKernelOpLowering : public OpConversionPattern<daphne::CallKernelOp>
std::vector<Value>
allocOutputReferences(Location &loc, PatternRewriter &rewriter,
ValueRange operands,
std::vector<Type> inputOutputTypes, size_t numRes, bool hasVarRes) const
std::vector<Type> inputOutputTypes, size_t numRes, bool hasVarRes,
daphne::CallKernelOp op) const
{

std::vector<Value> kernelOperands;

// Obtain an insertion point at the beginning of the function surrounding this CallKernelOp
// (see comment on AllocaOp above).
OpBuilder::InsertPoint ipHere = rewriter.saveInsertionPoint();
Block & fb = op.getOperation()->getParentOfType<LLVM::LLVMFuncOp>().getBody().front();
rewriter.setInsertionPointToStart(&fb);
OpBuilder::InsertPoint ipFuncStart = rewriter.saveInsertionPoint();
rewriter.restoreInsertionPoint(ipHere);

// --------------------------------------------------------------------
// Results
// --------------------------------------------------------------------

if(hasVarRes) { // combine all results into one variadic result
// Allocate an array of numRes elements.

// Set the insertion point to the beginning of the function (see comment on AllocaOp above).
ipHere = rewriter.saveInsertionPoint();
rewriter.restoreInsertionPoint(ipFuncStart);
auto allocaOp = rewriter.create<LLVM::AllocaOp>(
loc,
inputOutputTypes[0],
rewriter.create<arith::ConstantOp>(loc, rewriter.getI64IntegerAttr(numRes)).getResult()
);
ipFuncStart = rewriter.saveInsertionPoint();

// Go back to the original insertion point.
rewriter.restoreInsertionPoint(ipHere);

kernelOperands.push_back(allocaOp);

// If the type of this result parameter is a pointer (i.e. when it
@@ -399,13 +447,27 @@ class CallKernelOpLowering : public OpConversionPattern<daphne::CallKernelOp>
}
else { // typical case
// Constant of 1 for AllocaOp of output.
// Set the insertion point to the beginning of the function (see comment on AllocaOp above).
ipHere = rewriter.saveInsertionPoint();
rewriter.restoreInsertionPoint(ipFuncStart);
Value cst1 = rewriter.create<arith::ConstantOp>(loc, rewriter.getI64IntegerAttr(1));
ipFuncStart = rewriter.saveInsertionPoint();

// Go back to the original insertion point.
rewriter.restoreInsertionPoint(ipHere);

for (size_t i = 0; i < numRes; i++) {
// Allocate space for a single element.
// Set the insertion point to the beginning of the function (see comment on AllocaOp above).
ipHere = rewriter.saveInsertionPoint();
rewriter.restoreInsertionPoint(ipFuncStart);
auto allocaOp = rewriter.create<LLVM::AllocaOp>(loc, inputOutputTypes[i], cst1);
ipFuncStart = rewriter.saveInsertionPoint();
kernelOperands.push_back(allocaOp);

// Go back to the original insertion point.
rewriter.restoreInsertionPoint(ipHere);

// If the type of this result parameter is a pointer (i.e. when it
// represents a matrix or frame), then initialize the allocated
// element with a null pointer (required by the kernels). Otherwise
@@ -450,6 +512,11 @@ class CreateVariadicPackOpLowering : public OpConversionPattern<daphne::CreateVa
matchAndRewrite(daphne::CreateVariadicPackOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override
{
// Set the insertion point to the beginning of the function surrounding this CreateVariadicPackOp
// (see comment on AllocaOp above).
Block & fb = op.getOperation()->getParentOfType<LLVM::LLVMFuncOp>().getBody().front();
rewriter.setInsertionPointToStart(&fb);

Type contType = op.getRes().getType().dyn_cast<daphne::VariadicPackType>().getContainedType();
Type convType = typeConverter->convertType(contType);
rewriter.replaceOpWithNewOp<LLVM::AllocaOp>(
@@ -821,38 +888,46 @@ class VectorizedPipelineOpLowering : public OpConversionPattern<daphne::Vectoriz
newOperands.push_back(vpInputs);
newOperands.push_back(rewriter.create<daphne::ConstantOp>(loc, rewriter.getIndexType(), rewriter.getIndexAttr(numDataOperands)));

// Obtain an insertion point at the beginning of the function surrounding this VectorizedPipelineOp
// (see comment on AllocaOp above).
OpBuilder::InsertPoint ipHere = rewriter.saveInsertionPoint();
Block & fb = op.getOperation()->getParentOfType<LLVM::LLVMFuncOp>().getBody().front();
rewriter.setInsertionPointToStart(&fb);
OpBuilder::InsertPoint ipFuncStart = rewriter.saveInsertionPoint();
rewriter.restoreInsertionPoint(ipHere);

auto numOutputs = op.getNumResults();
// Variadic num rows operands.
callee << "__" << CompilerUtils::mlirTypeToCppTypeName(rewriter.getIntegerType(64, true), false);
auto rowsOperands = adaptor.getOperands().drop_front(numDataOperands);
newOperands
.push_back(convertToArray(loc, rewriter, rewriter.getI64Type(), rowsOperands.take_front(numOutputs)));
.push_back(convertToArray(loc, rewriter, rewriter.getI64Type(), rowsOperands.take_front(numOutputs), ipFuncStart));
callee << "__" << CompilerUtils::mlirTypeToCppTypeName(rewriter.getIntegerType(64, true), false);
auto colsOperands = rowsOperands.drop_front(numOutputs);
newOperands.push_back(convertToArray(loc, rewriter, rewriter.getI64Type(), colsOperands.take_front(numOutputs)));
newOperands.push_back(convertToArray(loc, rewriter, rewriter.getI64Type(), colsOperands.take_front(numOutputs), ipFuncStart));

// Add array of split enums
callee << "__int64_t";
std::vector<Value> splitConsts;
for(auto split : op.getSplits()) {
splitConsts.push_back(rewriter.create<arith::ConstantOp>(loc, split));
}
newOperands.push_back(convertToArray(loc, rewriter, rewriter.getI64Type(), splitConsts));
newOperands.push_back(convertToArray(loc, rewriter, rewriter.getI64Type(), splitConsts, ipFuncStart));

// Add array of combine enums
callee << "__int64_t";
std::vector<Value> combineConsts;
for(auto combine : op.getCombines()) {
combineConsts.push_back(rewriter.create<arith::ConstantOp>(loc, combine));
}
newOperands.push_back(convertToArray(loc, rewriter, rewriter.getI64Type(), combineConsts));
newOperands.push_back(convertToArray(loc, rewriter, rewriter.getI64Type(), combineConsts, ipFuncStart));

// TODO: pass function pointer with special placeholder instead of `void`

callee << "__size_t";
newOperands.push_back(rewriter.create<daphne::ConstantOp>(loc, rewriter.getIndexType(), rewriter.getIndexAttr(func_ptrs.size())));
callee << "__void_variadic";
newOperands.push_back(convertToArray(loc, rewriter, ptrPtrI1Ty, func_ptrs));
newOperands.push_back(convertToArray(loc, rewriter, ptrPtrI1Ty, func_ptrs, ipFuncStart));
// newOperands.push_back(fnPtr);

// Add ctx
@@ -876,12 +951,22 @@ class VectorizedPipelineOpLowering : public OpConversionPattern<daphne::Vectoriz
return success();
}
private:
static Value convertToArray(Location loc, ConversionPatternRewriter &rewriter, Type valueTy, ValueRange values)
static Value convertToArray(Location loc, ConversionPatternRewriter &rewriter, Type valueTy, ValueRange values, OpBuilder::InsertPoint & ipFuncStart)
{
// Set the insertion point to the beginning of the function surrounding this VectorizedPipelineOp
// (see comment on AllocaOp above).
OpBuilder::InsertPoint ipHere = rewriter.saveInsertionPoint();
rewriter.restoreInsertionPoint(ipFuncStart);

auto valuePtrTy = LLVM::LLVMPointerType::get(valueTy);
auto array = rewriter.create<LLVM::AllocaOp>(loc,
valuePtrTy,
Value(rewriter.create<arith::ConstantOp>(loc, rewriter.getI64IntegerAttr(values.size()))));
ipFuncStart = rewriter.saveInsertionPoint();

// Go back to the original insertion point.
rewriter.restoreInsertionPoint(ipHere);

for(auto i = 0u; i < values.size(); ++i) {
Value cstI = rewriter.create<arith::ConstantOp>(loc, rewriter.getI64IntegerAttr(i));
auto addr = rewriter.create<LLVM::GEPOp>(loc, valuePtrTy, array, ArrayRef<Value>({cstI}));
18 changes: 17 additions & 1 deletion test/api/cli/controlflow/ControlFlowTest.cpp
@@ -56,4 +56,20 @@ MAKE_TEST_CASE("for", 23)
MAKE_TEST_CASE("while", 16)
MAKE_TEST_CASE("nested", 26)

MAKE_FAILURE_TEST_CASE("stop", 2)
MAKE_FAILURE_TEST_CASE("stop", 2)

TEST_CASE("loop-with-many-iterations", TAG_CONTROLFLOW) {
std::stringstream exp;
for(size_t i = 1; i <= 500*1000; i++)
exp << i << std::endl;
compareDaphneToStr(exp.str(), dirPath + "for_manyiterations_1.daphne");
compareDaphneToStr(exp.str(), dirPath + "while_manyiterations_1.daphne");
}

TEST_CASE("loop-with-many-iterations_variadic-op", TAG_CONTROLFLOW) {
std::stringstream exp;
for(size_t i = 1; i <= 500*1000; i++)
exp << "Frame(1x1, [col_0:int64_t])" << std::endl << i << std::endl;
compareDaphneToStr(exp.str(), dirPath + "for_manyiterations_2.daphne");
compareDaphneToStr(exp.str(), dirPath + "while_manyiterations_2.daphne");
}
4 changes: 4 additions & 0 deletions test/api/cli/controlflow/for_manyiterations_1.daphne
@@ -0,0 +1,4 @@
// A for-loop with many iterations.

for(i in 1:500_000)
print(i);
6 changes: 6 additions & 0 deletions test/api/cli/controlflow/for_manyiterations_2.daphne
@@ -0,0 +1,6 @@
// A for-loop with many iterations, containing a variadic op.

for(i in 1:500_000) {
f = createFrame(as.matrix(i));
print(f);
}
7 changes: 7 additions & 0 deletions test/api/cli/controlflow/while_manyiterations_1.daphne
@@ -0,0 +1,7 @@
// A while-loop with many iterations.

i = 1;
while(i <= 500_000) {
print(i);
i = i + 1;
}
8 changes: 8 additions & 0 deletions test/api/cli/controlflow/while_manyiterations_2.daphne
@@ -0,0 +1,8 @@
// A while-loop with many iterations, containing a variadic op.

i = 1;
while(i <= 500_000) {
f = createFrame(as.matrix(i));
print(f);
i = i + 1;
}