Skip to content

Commit

Permalink
Merge branch 'daphne-eu:main' into codegen-ops
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexRTer authored Nov 17, 2024
2 parents c60de05 + 287f4c5 commit 20cbc5d
Show file tree
Hide file tree
Showing 20 changed files with 264 additions and 227 deletions.
6 changes: 3 additions & 3 deletions src/api/cli/DaphneUserConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,9 @@ struct DaphneUserConfig {

bool force_cuda = false;

SelfSchedulingScheme taskPartitioningScheme = STATIC;
QueueTypeOption queueSetupScheme = CENTRALIZED;
VictimSelectionLogic victimSelection = SEQPRI;
SelfSchedulingScheme taskPartitioningScheme = SelfSchedulingScheme::STATIC;
QueueTypeOption queueSetupScheme = QueueTypeOption::CENTRALIZED;
VictimSelectionLogic victimSelection = VictimSelectionLogic::SEQPRI;
ALLOCATION_TYPE distributedBackEndSetup = ALLOCATION_TYPE::DIST_MPI; // default value
size_t max_distributed_serialization_chunk_size =
std::numeric_limits<int>::max() - 1024; // 2GB (-1KB to make up for gRPC headers etc.) - which is the
Expand Down
5 changes: 5 additions & 0 deletions src/api/internal/daphne_internal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ int startDAPHNE(int argc, const char **argv, DaphneLibResult *daphneLibRes, int
init(""));

// Scheduling options
using enum SelfSchedulingScheme;
using enum QueueTypeOption;
using enum VictimSelectionLogic;

static opt<SelfSchedulingScheme> taskPartitioningScheme(
"partitioning", cat(schedulingOptions), desc("Choose task partitioning scheme:"),
Expand All @@ -179,11 +182,13 @@ int startDAPHNE(int argc, const char **argv, DaphneLibResult *daphneLibRes, int
"i.e., MFSC does not require profiling information as FSC"),
clEnumVal(PSS, "Probabilistic self-scheduling"), clEnumVal(AUTO, "Automatic partitioning")),
init(STATIC));

static opt<QueueTypeOption> queueSetupScheme(
"queue_layout", cat(schedulingOptions), desc("Choose queue setup scheme:"),
values(clEnumVal(CENTRALIZED, "One queue (default)"), clEnumVal(PERGROUP, "One queue per CPU group"),
clEnumVal(PERCPU, "One queue per CPU core")),
init(CENTRALIZED));

static opt<VictimSelectionLogic> victimSelection(
"victim_selection", cat(schedulingOptions), desc("Choose work stealing victim selection logic:"),
values(clEnumVal(SEQ, "Steal from next adjacent worker (default)"),
Expand Down
26 changes: 13 additions & 13 deletions src/parser/config/ConfigParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,19 @@
#include <string>

// must be in the same namespace as the enum SelfSchedulingScheme
NLOHMANN_JSON_SERIALIZE_ENUM(SelfSchedulingScheme, {{INVALID, nullptr},
{STATIC, "STATIC"},
{SS, "SS"},
{GSS, "GSS"},
{TSS, "TSS"},
{FAC2, "FAC2"},
{TFSS, "TFSS"},
{FISS, "FISS"},
{VISS, "VISS"},
{PLS, "PLS"},
{MSTATIC, "MSTATIC"},
{MFSC, "MFSC"},
{PSS, "PSS"}})
NLOHMANN_JSON_SERIALIZE_ENUM(SelfSchedulingScheme, {{SelfSchedulingScheme::INVALID, nullptr},
{SelfSchedulingScheme::STATIC, "STATIC"},
{SelfSchedulingScheme::SS, "SS"},
{SelfSchedulingScheme::GSS, "GSS"},
{SelfSchedulingScheme::TSS, "TSS"},
{SelfSchedulingScheme::FAC2, "FAC2"},
{SelfSchedulingScheme::TFSS, "TFSS"},
{SelfSchedulingScheme::FISS, "FISS"},
{SelfSchedulingScheme::VISS, "VISS"},
{SelfSchedulingScheme::PLS, "PLS"},
{SelfSchedulingScheme::MSTATIC, "MSTATIC"},
{SelfSchedulingScheme::MFSC, "MFSC"},
{SelfSchedulingScheme::PSS, "PSS"}})

class ConfigParser {
public:
Expand Down
1 change: 1 addition & 0 deletions src/runtime/local/kernels/BinaryOpCode.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ static constexpr bool supportsBinaryOp = false;

// Concise specification of which binary operations should be supported on
// which value types.
SUPPORT_EQUALITY(bool)
SUPPORT_NUMERIC_FP(double)
SUPPORT_NUMERIC_FP(float)
SUPPORT_NUMERIC_INT(int64_t)
Expand Down
4 changes: 3 additions & 1 deletion src/runtime/local/kernels/VectorizedPipeline.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <runtime/local/datastructures/DataObjectFactory.h>
#include <runtime/local/datastructures/DenseMatrix.h>
#include <runtime/local/vectorized/MTWrapper.h>
#include <runtime/local/vectorized/PipelineHWlocInfo.h>

#include <cstddef>

Expand All @@ -35,7 +36,8 @@ template <class DTRes> struct VectorizedPipeline {
static void apply(DTRes **outputs, size_t numOutputs, bool *isScalar, Structure **inputs, size_t numInputs,
int64_t *outRows, int64_t *outCols, int64_t *splits, int64_t *combines, size_t numFuncs,
void **fun, DCTX(ctx)) {
auto wrapper = std::make_unique<MTWrapper<DTRes>>(numFuncs, ctx);
static PipelineHWlocInfo topology{ctx};
auto wrapper = std::make_unique<MTWrapper<DTRes>>(numFuncs, topology, ctx);

std::vector<std::function<void(DTRes ***, Structure **, DCTX(ctx))>> funcs;
for (auto i = 0ul; i < numFuncs; ++i) {
Expand Down
1 change: 1 addition & 0 deletions src/runtime/local/kernels/kernels.json
Original file line number Diff line number Diff line change
Expand Up @@ -2175,6 +2175,7 @@
["int64_t", "int64_t", "int64_t"],
["uint64_t", "uint64_t", "uint64_t"],
["uint32_t", "uint32_t", "uint32_t"],
["bool", "bool", "bool"],
["size_t", "size_t", "size_t"],
["const char *", "const char *", "const char *"],
["int64_t", "const char *", "const char *"]
Expand Down
61 changes: 31 additions & 30 deletions src/runtime/local/vectorized/LoadPartitioning.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,13 @@
#include "LoadPartitioningDefs.h"

#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>

class LoadPartitioning {

private:
int schedulingMethod;
SelfSchedulingScheme schedulingMethod;
uint64_t totalTasks;
uint64_t chunkParam;
uint64_t scheduledTasks;
Expand All @@ -37,8 +36,8 @@ class LoadPartitioning {
uint64_t tssDelta;
uint64_t mfscChunk;
uint32_t fissStages;
int getMethod(const char *method) { return std::stoi(method); }
int getStages(int tasks, int workers) {

static int getStages(int tasks, int workers) {
int actual_step = 0;
int scheduled = 0;
int step = 0;
Expand All @@ -52,19 +51,18 @@ class LoadPartitioning {
}

public:
LoadPartitioning(int method, uint64_t tasks, uint64_t chunk, uint32_t workers, bool autoChunk) {
schedulingMethod = method;
totalTasks = tasks;
LoadPartitioning(SelfSchedulingScheme method, uint64_t tasks, uint64_t chunk, uint32_t workers, bool autoChunk)
: schedulingMethod(method), totalTasks(tasks), fissStages(getStages(totalTasks, workers)) {
double tSize = (totalTasks + workers - 1.0) / workers;
mfscChunk = ceil(tSize * log(2.0) / log((1.0 * tSize)));
fissStages = getStages(totalTasks, workers);

if (!autoChunk) {
chunkParam = chunk;
} else {
// calculate expertChunk
int mul = log2(totalTasks / workers) * 0.618;
chunkParam = (totalTasks) / ((2 << mul) * workers);
method = SS;
method = SelfSchedulingScheme::SS;
if (chunkParam < 1) {
chunkParam = 1;
}
Expand All @@ -74,77 +72,80 @@ class LoadPartitioning {
schedulingStep = 0;
scheduledTasks = 0;
tssChunk = (uint64_t)ceil((double)totalTasks / ((double)2.0 * totalWorkers));
uint64_t nTemp = (uint64_t)ceil(2.0 * totalTasks / (tssChunk + 1.0));
auto nTemp = (uint64_t)ceil(2.0 * totalTasks / (tssChunk + 1.0));
tssDelta = (uint64_t)(tssChunk - 1.0) / (double)(nTemp - 1.0);
}
bool hasNextChunk() { return scheduledTasks < totalTasks; }

[[nodiscard]] bool hasNextChunk() const { return scheduledTasks < totalTasks; }

uint64_t getNextChunk() {
uint64_t chunkSize = 0;
switch (schedulingMethod) {
case STATIC: { // STATIC
case SelfSchedulingScheme::STATIC: {
chunkSize = std::ceil(totalTasks / totalWorkers);
break;
}
case SS: { // self-scheduling (SS)
case SelfSchedulingScheme::SS: {
chunkSize = 1;
break;
}
case GSS: { // guided self-scheduling (GSS)
case SelfSchedulingScheme::GSS: {
chunkSize = (uint64_t)ceil((double)remainingTasks / totalWorkers);
break;
}
case TSS: { // trapezoid self-scheduling (TSS)
case SelfSchedulingScheme::TSS: {
chunkSize = tssChunk - tssDelta * schedulingStep;
break;
}
case FAC2: { // factoring (FAC2)
uint64_t actualStep = schedulingStep / totalWorkers; // has to be an integer division
case SelfSchedulingScheme::FAC2: {
const uint64_t actualStep = schedulingStep / totalWorkers; // has to be an integer division
chunkSize = (uint64_t)ceil(pow(0.5, actualStep + 1) * (totalTasks / totalWorkers));
break;
}
case TFSS: { // trapezoid factoring self-scheduling (TFSS)
case SelfSchedulingScheme::TFSS: {
chunkSize = (uint64_t)ceil((double)remainingTasks / ((double)2.0 * totalWorkers));
break;
}
case FISS: { // fixed increase self-scheduling (FISS)
case SelfSchedulingScheme::FISS: {
// TODO
uint64_t X = fissStages + 2;
uint64_t initChunk = (uint64_t)ceil(totalTasks / ((2.0 + fissStages) * totalWorkers));
const uint64_t X = fissStages + 2;
auto initChunk = (uint64_t)ceil(totalTasks / ((2.0 + fissStages) * totalWorkers));
chunkSize =
initChunk + schedulingStep * (uint64_t)ceil((2.0 * totalTasks * (1.0 - (fissStages / X))) /
(totalWorkers * fissStages *
(fissStages - 1))); // chunksize with increment after init
break;
}
case VISS: { // variable increase self-scheduling (VISS)
case SelfSchedulingScheme::VISS: {
// TODO
uint64_t schedulingStepnew = schedulingStep / totalWorkers;
uint64_t initChunk = (uint64_t)ceil(totalTasks / ((2.0 + fissStages) * totalWorkers));
auto initChunk = (uint64_t)ceil(totalTasks / ((2.0 + fissStages) * totalWorkers));
chunkSize = initChunk * (uint64_t)ceil((double)(1 - pow(0.5, schedulingStepnew)) / 0.5);
break;
}
case PLS: { // performance-based loop self-scheduling (PLS)
case SelfSchedulingScheme::PLS: {
// TODO
double SWR = 0.5; // static workload ratio
const double SWR = 0.5; // static workload ratio
if (remainingTasks > totalTasks - (totalTasks * SWR)) {
chunkSize = (uint64_t)ceil((double)totalTasks * SWR / totalWorkers);
} else {
chunkSize = (uint64_t)ceil((double)remainingTasks / totalWorkers);
}
break;
}
case PSS: { // probabilistic self-scheduling (PSS)
case SelfSchedulingScheme::PSS: { // probabilistic self-scheduling (PSS)
// E[P] is the average number of idle processor, for now we use
// still totalWorkers
double averageIdleProc = (double)totalWorkers;
auto averageIdleProc = (double)totalWorkers;
chunkSize = (uint64_t)ceil((double)remainingTasks / (1.5 * averageIdleProc));
// TODO
break;
}
case MFSC: { // modifed fixed-size chunk self-scheduling (MFSC)
case SelfSchedulingScheme::MFSC: { // modified fixed-size chunk self-scheduling (MFSC)
chunkSize = mfscChunk;
break;
}
case SelfSchedulingScheme::MSTATIC:
default: {
chunkSize = (uint64_t)ceil(totalTasks / totalWorkers / 4.0);
break;
Expand All @@ -157,4 +158,4 @@ class LoadPartitioning {
remainingTasks -= chunkSize;
return chunkSize;
}
};
};
30 changes: 15 additions & 15 deletions src/runtime/local/vectorized/LoadPartitioningDefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,23 @@

#pragma once

enum QueueTypeOption { CENTRALIZED = 0, PERGROUP, PERCPU };
enum class QueueTypeOption { CENTRALIZED, PERGROUP, PERCPU };

enum VictimSelectionLogic { SEQ = 0, SEQPRI, RANDOM, RANDOMPRI };
enum class VictimSelectionLogic { SEQ, SEQPRI, RANDOM, RANDOMPRI };

enum SelfSchedulingScheme {
STATIC = 0,
SS,
GSS,
TSS,
FAC2,
TFSS,
FISS,
VISS,
PLS,
enum class SelfSchedulingScheme {
INVALID = -1,
STATIC,
SS, // self-scheduling
GSS, // guided self-scheduling
TSS, // trapezoid self-scheduling
FAC2, // factoring
TFSS, // trapezoid factoring self-scheduling (TFSS)
FISS, // fixed increase self-scheduling
VISS, // variable increase self-scheduling
PLS, // performance-based loop self-scheduling
PSS, // probabilistic self-scheduling
MSTATIC,
MFSC,
PSS,
MFSC, // modified fixed-size chunk self-scheduling
AUTO,
INVALID = -1 /* only for JSON enum conversion */
};
Loading

0 comments on commit 20cbc5d

Please sign in to comment.